Pendrokar committed on
Commit
1e7417b
·
1 Parent(s): 09d8f48

Added Fish Speech TTS

Browse files
Files changed (1) hide show
  1. app.py +34 -5
app.py CHANGED
@@ -72,6 +72,7 @@ AVAILABLE_MODELS = {
72
  # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
73
  'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # 4.29.0
74
  'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29.0
 
75
 
76
  # Parler
77
  'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29.0 4.42.0
@@ -186,6 +187,13 @@ HF_SPACES = {
186
  'return_audio_index': 0,
187
  },
188
 
 
 
 
 
 
 
 
189
  # TTS w issues
190
  # 'PolyAI/pheme': '/predict#0', #sleepy HF Space
191
  # 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
@@ -200,6 +208,7 @@ HF_SPACES = {
200
  # for zero-shot TTS - voice sample of Scarlett Johansson
201
  DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/iKHHqWxWy6Zfmp6QP6CZZ.wav'
202
  DEFAULT_VOICE_SAMPLE = file(DEFAULT_VOICE_SAMPLE_STR)
 
203
 
204
  OVERRIDE_INPUTS = {
205
  'coqui/xtts': {
@@ -269,6 +278,19 @@ OVERRIDE_INPUTS = {
269
  2: 0, # pace rate
270
  3: 0, # pitch
271
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  }
273
 
274
  hf_clients = {}
@@ -693,7 +715,7 @@ def downvote_model(model, uname):
693
  cursor.close()
694
 
695
  def a_is_better(model1, model2, userid):
696
- print("A is better", model1, model2)
697
  if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
698
  raise gr.Error('Sorry, please try voting again.')
699
  userid = mkuuid(userid)
@@ -708,7 +730,7 @@ def a_is_better(model1, model2, userid):
708
  downvote_model(model2, str(userid))
709
  return reload(model1, model2, userid, chose_a=True)
710
  def b_is_better(model1, model2, userid):
711
- print("B is better", model1, model2)
712
  if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
713
  raise gr.Error('Sorry, please try voting again.')
714
  userid = mkuuid(userid)
@@ -931,7 +953,13 @@ def synthandreturn(text):
931
  results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
932
 
933
  # return path to audio
934
- result = results[return_audio_index] if (not isinstance(results, str)) else results
 
 
 
 
 
 
935
  else:
936
  # Use the private HF Space
937
  result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
@@ -969,6 +997,7 @@ def synthandreturn(text):
969
  os.unlink(result)
970
  result = f.name
971
  except:
 
972
  pass
973
  if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
974
  result_storage[model] = result
@@ -1104,7 +1133,7 @@ def unlock_vote(btn_index, aplayed, bplayed):
1104
 
1105
  # both audio samples played
1106
  if bool(aplayed) and bool(bplayed):
1107
- print('Both audio samples played, voting unlocked')
1108
  return [gr.update(interactive=True), gr.update(interactive=True), True, True]
1109
 
1110
  return [gr.update(), gr.update(), aplayed, bplayed]
@@ -1309,7 +1338,7 @@ with gr.Blocks() as vote:
1309
  .then(
1310
  None,
1311
  inputs=[bplayed],
1312
- js="(b) => {console.log(b); b ? 0 : document.querySelector('.stretch .gap+.gap button.play-pause-button').click()}",
1313
  )
1314
  # autoplay if unplayed
1315
  aud2\
 
72
  # 'coqui/CoquiTTS': 'coqui/CoquiTTS',
73
  'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # 4.29.0
74
  'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29.0
75
+ 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # 4.29.0
76
 
77
  # Parler
78
  'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29.0 4.42.0
 
187
  'return_audio_index': 0,
188
  },
189
 
190
+ 'fishaudio/fish-speech-1': {
191
+ 'name': 'Fish Speech',
192
+ 'function': '/inference_wrapper',
193
+ 'text_param_index': 0,
194
+ 'return_audio_index': 1,
195
+ },
196
+
197
  # TTS w issues
198
  # 'PolyAI/pheme': '/predict#0', #sleepy HF Space
199
  # 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
 
208
  # for zero-shot TTS - voice sample of Scarlett Johansson
209
  DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/iKHHqWxWy6Zfmp6QP6CZZ.wav'
210
  DEFAULT_VOICE_SAMPLE = file(DEFAULT_VOICE_SAMPLE_STR)
211
+ DEFAULT_VOICE_TRANSCRIPT = "In the first half of the 20th century, science fiction familiarized the world with the concept of artificially intelligent robots. It began with the “heartless” Tin man from the Wizard of Oz and continued with the humanoid robot that impersonated Maria in Metropolis. By the 1950s, we had a generation of scientists, mathematicians, and philosophers with the concept of artificial intelligence (or AI) culturally assimilated in their minds."
212
 
213
  OVERRIDE_INPUTS = {
214
  'coqui/xtts': {
 
278
  2: 0, # pace rate
279
  3: 0, # pitch
280
  },
281
+
282
+ 'fishaudio/fish-speech-1': {
283
+ 1: True, # enable_reference_audio
284
+ 2: DEFAULT_VOICE_SAMPLE, # reference_audio
285
+ 3: DEFAULT_VOICE_TRANSCRIPT, # reference_text
286
+ 4: 1024, # max_new_tokens
287
+ 5: 200, # chunk_length
288
+ 6: 0.7, # top_p
289
+ 7: 1.2, # repetition_penalty
290
+ 8: 0.7, # temperature
291
+ 9: 1, # batch_infer_num
292
+ 10: False, # if_load_asr_model
293
+ },
294
  }
295
 
296
  hf_clients = {}
 
715
  cursor.close()
716
 
717
  def a_is_better(model1, model2, userid):
718
+ # print("A is better", model1, model2)
719
  if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
720
  raise gr.Error('Sorry, please try voting again.')
721
  userid = mkuuid(userid)
 
730
  downvote_model(model2, str(userid))
731
  return reload(model1, model2, userid, chose_a=True)
732
  def b_is_better(model1, model2, userid):
733
+ # print("B is better", model1, model2)
734
  if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
735
  raise gr.Error('Sorry, please try voting again.')
736
  userid = mkuuid(userid)
 
953
  results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
954
 
955
  # return path to audio
956
+ result = results
957
+ if (not isinstance(results, str)):
958
+ # return_audio_index may be a filepath string
959
+ result = results[return_audio_index]
960
+ if (isinstance(result, dict)):
961
+ # return_audio_index is a dictionary
962
+ result = results[return_audio_index]['value']
963
  else:
964
  # Use the private HF Space
965
  result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
 
997
  os.unlink(result)
998
  result = f.name
999
  except:
1000
+ print(f"{model}: [WARN] Unable to resample audio")
1001
  pass
1002
  if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
1003
  result_storage[model] = result
 
1133
 
1134
  # both audio samples played
1135
  if bool(aplayed) and bool(bplayed):
1136
+ # print('Both audio samples played, voting unlocked')
1137
  return [gr.update(interactive=True), gr.update(interactive=True), True, True]
1138
 
1139
  return [gr.update(), gr.update(), aplayed, bplayed]
 
1338
  .then(
1339
  None,
1340
  inputs=[bplayed],
1341
+ js="(b) => b ? 0 : document.querySelector('.stretch .gap+.gap button.play-pause-button').click()",
1342
  )
1343
  # autoplay if unplayed
1344
  aud2\