Pendrokar committed on
Commit
ee7a278
1 Parent(s): 75ba2c6

save samples to cache; load cached; a way to fix #4

Browse files
Files changed (1) hide show
  1. app.py +142 -12
app.py CHANGED
@@ -13,6 +13,20 @@ from detoxify import Detoxify
13
  import os
14
  import tempfile
15
  from pydub import AudioSegment
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def match_target_amplitude(sound, target_dBFS):
18
  change_in_dBFS = target_dBFS - sound.dBFS
@@ -257,6 +271,27 @@ OVERRIDE_INPUTS = {
257
  }
258
 
259
  hf_clients = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
  SPACE_ID = os.getenv('SPACE_ID')
262
  MAX_SAMPLE_TXT_LENGTH = 300
@@ -378,7 +413,9 @@ Vote to help the community find the best available text-to-speech model!
378
  INSTR = """
379
  ## 🗳️ Vote
380
 
381
- * Input text (English only) to synthesize audio (or press 🎲 for random text).
 
 
382
  * Listen to the two audio clips, one after the other.
383
  * Vote on which audio sounds more natural to you.
384
  * _Note: Model names are revealed after the vote is cast._
@@ -611,7 +648,7 @@ def make_link_to_space(model_name):
611
  model_basename = HF_SPACES[model_name]['name']
612
 
613
  if '/' in model_name:
614
- return '🤗 <a style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
615
 
616
  # otherwise just return the model name
617
  return model_name
@@ -989,6 +1026,26 @@ def synthandreturn(text):
989
  #debug
990
  # outputs = [text, btn, r2, model1, model2, aud1, aud2, abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
991
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
992
  print(f"Retrieving models {mdl1k} and {mdl2k} from API")
993
  return (
994
  text,
@@ -1046,26 +1103,94 @@ def unlock_vote(btn_index, aplayed, bplayed):
1046
 
1047
  return [gr.update(), gr.update(), aplayed, bplayed]
1048
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1049
  def randomsent():
1050
- return random.choice(sents), '🎲'
1051
  def clear_stuff():
1052
- return "", "Synthesize", gr.update(visible=False), '', '', gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1053
  def disable():
1054
  return [gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)]
1055
  def enable():
1056
  return [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)]
1057
  with gr.Blocks() as vote:
1058
  # sample played
1059
- #aplayed = gr.State(value=False)
1060
- #bplayed = gr.State(value=False)
1061
  # voter ID
1062
  useridstate = gr.State()
1063
  gr.Markdown(INSTR)
1064
  with gr.Group():
1065
  with gr.Row():
 
1066
  text = gr.Textbox(container=False, show_label=False, placeholder="Enter text to synthesize", lines=1, max_lines=1, scale=9999999, min_width=0)
1067
  randomt = gr.Button('🎲', scale=0, min_width=0, variant='tool')
1068
- randomt.click(randomsent, outputs=[text, randomt])
1069
  btn = gr.Button("Synthesize", variant='primary')
1070
  model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
1071
  #model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=True)
@@ -1096,7 +1221,9 @@ with gr.Blocks() as vote:
1096
  bbetter,
1097
  prevmodel1,
1098
  prevmodel2,
1099
- nxtroundbtn
 
 
1100
  ]
1101
  """
1102
  text,
@@ -1111,12 +1238,15 @@ with gr.Blocks() as vote:
1111
  gr.update(visible=False), #prevmodel1
1112
  gr.update(visible=False), #prevmodel2
1113
  gr.update(visible=False), #nxt round btn"""
1114
- btn.click(disable, outputs=[btn, abetter, bbetter]).then(synthandreturn, inputs=[text], outputs=outputs).then(enable, outputs=[btn, abetter, bbetter])
1115
- nxtroundbtn.click(clear_stuff, outputs=outputs)
 
 
 
1116
 
1117
  # Allow interaction with the vote buttons only when both audio samples have finished playing
1118
- #aud1.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=0), aplayed, bplayed])
1119
- #aud2.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=1), aplayed, bplayed])
1120
 
1121
  # nxt_outputs = [prevmodel1, prevmodel2, abetter, bbetter]
1122
  nxt_outputs = [abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
 
13
  import os
14
  import tempfile
15
  from pydub import AudioSegment
16
+ import itertools
17
+ from typing import List, Tuple, Set, Dict
18
+ from hashlib import sha1
19
+
20
class User:
    """A voter tracked while browsing cached sample pairs.

    Attributes are plain and public; instances are kept in the module-level
    ``voting_users`` mapping keyed by ``user_id``.
    """

    def __init__(self, user_id: str):
        # Key under which this voter is stored by the caller.
        self.user_id = user_id
        # (filename, filename) tuples recorded for this voter so the same
        # pair is not offered twice.
        self.voted_pairs: Set[Tuple[str, str]] = set()

    def __repr__(self) -> str:
        # Keep the repr short: the number of recorded pairs is enough when
        # debugging, the full set can be large.
        return (
            f"{type(self).__name__}(user_id={self.user_id!r}, "
            f"voted_pairs={len(self.voted_pairs)})"
        )
24
+
25
class Sample:
    """A cached synthesized audio clip together with its provenance."""

    def __init__(self, filename: str, transcript: str, modelName: str):
        # Path/identifier of the audio handed back to the audio component.
        self.filename = filename
        # Text that was synthesized; samples are grouped by this value.
        self.transcript = transcript
        # Name of the model that produced the clip.
        self.modelName = modelName

    def __repr__(self) -> str:
        # Readable output for debug prints of the sample cache.
        return (
            f"{type(self).__name__}(filename={self.filename!r}, "
            f"transcript={self.transcript!r}, modelName={self.modelName!r})"
        )
30
 
31
  def match_target_amplitude(sound, target_dBFS):
32
  change_in_dBFS = target_dBFS - sound.dBFS
 
271
  }
272
 
273
  hf_clients = {}
274
+ # cache audio samples for quick voting
275
+ cached_samples: List[Sample] = []
276
+ voting_users = {
277
+ # userid as the key and USER() as the value
278
+ }
279
+
280
def generate_matching_pairs(samples: List["Sample"]) -> List[Tuple["Sample", "Sample"]]:
    """Return every unordered pair of samples that share a transcript.

    Samples are grouped by their ``transcript`` attribute, in order of first
    appearance, and all 2-combinations inside each group are emitted. A
    transcript with a single sample contributes no pairs.

    Args:
        samples: Cached samples; only ``.transcript`` is read here.

    Returns:
        A list of ``(sample_a, sample_b)`` tuples, where ``sample_a`` comes
        before ``sample_b`` in ``samples``.
    """
    by_transcript: Dict[str, list] = {}
    for sample in samples:
        # setdefault creates the bucket on first sight of a transcript.
        by_transcript.setdefault(sample.transcript, []).append(sample)

    matching_pairs: list = []
    for group in by_transcript.values():
        # extend() consumes the iterator directly; no intermediate list needed.
        matching_pairs.extend(itertools.combinations(group, 2))

    return matching_pairs
292
+
293
+ # List[Tuple[Sample, Sample]]
294
+ all_pairs = []
295
 
296
  SPACE_ID = os.getenv('SPACE_ID')
297
  MAX_SAMPLE_TXT_LENGTH = 300
 
413
  INSTR = """
414
  ## 🗳️ Vote
415
 
416
+ * Input text (English only) to synthesize audio.
417
+ * Press ⚡ to select a cached sample you have yet to vote on. Fast.
418
+ * Press 🎲 to randomly select text for a list. Slow.
419
  * Listen to the two audio clips, one after the other.
420
  * Vote on which audio sounds more natural to you.
421
  * _Note: Model names are revealed after the vote is cast._
 
648
  model_basename = HF_SPACES[model_name]['name']
649
 
650
  if '/' in model_name:
651
+ return '🤗 <a target="_top" style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
652
 
653
  # otherwise just return the model name
654
  return model_name
 
1026
  #debug
1027
  # outputs = [text, btn, r2, model1, model2, aud1, aud2, abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
1028
 
1029
+ # cache the result
1030
+ for model in [mdl1k, mdl2k]:
1031
+ already_cached = False
1032
+ # check if already cached
1033
+ for cached_sample in cached_samples:
1034
+ # TODO:replace cached
1035
+ if (cached_sample.transcript == text and cached_sample.modelName == model):
1036
+ already_cached = True
1037
+ break
1038
+
1039
+ if (already_cached):
1040
+ continue
1041
+
1042
+ print(f"Cached {model}")
1043
+ cached_samples.append(Sample(results[model], text, model))
1044
+ # print(cached_samples)
1045
+
1046
+ all_pairs = generate_matching_pairs(cached_samples)
1047
+ # print(all_pairs)
1048
+
1049
  print(f"Retrieving models {mdl1k} and {mdl2k} from API")
1050
  return (
1051
  text,
 
1103
 
1104
  return [gr.update(), gr.update(), aplayed, bplayed]
1105
 
1106
def cachedsent(request: gr.Request):
    """Serve the next cached sample pair the requesting user has not been shown.

    The voter is identified by a SHA-1 digest of the request's username when
    one is present, otherwise of the client host address. Voters are kept in
    the in-memory ``voting_users`` mapping only.

    Args:
        request: The Gradio request for the triggering event.

    Returns:
        Fifteen values matching ``[*outputs, cachedt]``. When no unseen pair
        exists, this is the ``clear_stuff()`` reset followed by an update that
        disables the cached-sample button; otherwise it is the updates that
        load the pair into the voting UI and keep that button enabled.
    """
    if request.username:
        print('auth by username')
        # by HuggingFace username
        identity = request.username
    else:
        print('auth by ip')
        # by IP address
        identity = request.client.host
    # The browser session hash is deliberately not used: it changes on page
    # reload. UTF-8 yields the same digest as ASCII for ASCII input but does
    # not raise UnicodeEncodeError on a non-ASCII username.
    userid = sha1(identity.encode('utf-8')).hexdigest()

    if userid not in voting_users:
        voting_users[userid] = User(userid)
    seen = voting_users[userid].voted_pairs

    # First pair whose filenames were not recorded for this user, in either order.
    pair = next(
        (
            candidate
            for candidate in generate_matching_pairs(cached_samples)
            if (candidate[0].filename, candidate[1].filename) not in seen
            and (candidate[1].filename, candidate[0].filename) not in seen
        ),
        None,
    )
    if pair is None:
        # Nothing left for this user: reset the UI and disable the cached button.
        return [*clear_stuff(), gr.update(interactive=False)]

    first, second = pair
    # TODO: move to abisbetter
    # NOTE: the pair is recorded as soon as it is served, not when a vote is cast.
    seen.add((first.filename, second.filename))
    return (
        first.transcript,
        "Synthesize",
        gr.update(visible=True), # r2
        first.modelName, # model1
        second.modelName, # model2
        gr.update(visible=True, value=first.filename), # aud1
        gr.update(visible=True, value=second.filename), # aud2
        gr.update(visible=True, interactive=False), #abetter
        gr.update(visible=True, interactive=False), #bbetter
        gr.update(visible=False), #prevmodel1
        gr.update(visible=False), #prevmodel2
        gr.update(visible=False), #nxt round btn
        # reset aplayed, bplayed audio playback events
        gr.update(value=False), #aplayed
        gr.update(value=False), #bplayed
        # fetch cached btn
        gr.update(interactive=True)
    )
1158
def randomsent():
    """Pick a random sentence from ``sents`` for the text box.

    Returns a 3-tuple for the ``[cachedt, text, randomt]`` outputs: the two
    button labels are returned unchanged around the chosen sentence.
    """
    sentence = random.choice(sents)
    return '⚡', sentence, '🎲'
1160
def clear_stuff():
    """Build the 14 updates that reset the voting UI to its empty state.

    Order: text, button label, r2, model1, model2, aud1, aud2, abetter,
    bbetter, prevmodel1, prevmodel2, next-round button, aplayed, bplayed.
    """
    # text, "Synthesize" label, r2 (left visible), model1, model2
    updates = ['', "Synthesize", gr.update(visible=True), '', '']
    # aud1, aud2, abetter, bbetter, prevmodel1, prevmodel2, nxt round btn
    updates += [gr.update(visible=False) for _ in range(7)]
    # aplayed, bplayed
    updates += [gr.update(value=False) for _ in range(2)]
    return updates
1177
def disable():
    """Return three updates that make their target components non-interactive."""
    return [gr.update(interactive=False) for _ in range(3)]
1179
def enable():
    """Return three updates that make their target components interactive."""
    return [gr.update(interactive=True) for _ in range(3)]
1181
  with gr.Blocks() as vote:
1182
  # sample played
1183
+ aplayed = gr.State(value=False)
1184
+ bplayed = gr.State(value=False)
1185
  # voter ID
1186
  useridstate = gr.State()
1187
  gr.Markdown(INSTR)
1188
  with gr.Group():
1189
  with gr.Row():
1190
+ cachedt = gr.Button('⚡', scale=0, min_width=0, variant='tool', interactive=len(cached_samples)>0)
1191
  text = gr.Textbox(container=False, show_label=False, placeholder="Enter text to synthesize", lines=1, max_lines=1, scale=9999999, min_width=0)
1192
  randomt = gr.Button('🎲', scale=0, min_width=0, variant='tool')
1193
+ randomt.click(randomsent, outputs=[cachedt, text, randomt])
1194
  btn = gr.Button("Synthesize", variant='primary')
1195
  model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
1196
  #model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=True)
 
1221
  bbetter,
1222
  prevmodel1,
1223
  prevmodel2,
1224
+ nxtroundbtn,
1225
+ aplayed,
1226
+ bplayed,
1227
  ]
1228
  """
1229
  text,
 
1238
  gr.update(visible=False), #prevmodel1
1239
  gr.update(visible=False), #prevmodel2
1240
  gr.update(visible=False), #nxt round btn"""
1241
+ btn.click(disable, outputs=[btn, abetter, bbetter]).then(synthandreturn, inputs=[text], outputs=outputs).then(enable, outputs=[btn, gr.State(), gr.State()])
1242
+ nxtroundbtn.click(cachedsent, outputs=[*outputs, cachedt])
1243
+
1244
+ # fetch a comparison pair from cache
1245
+ cachedt.click(disable, outputs=[cachedt, abetter, bbetter]).then(cachedsent, outputs=[*outputs, cachedt]).then(enable, outputs=[btn, gr.State(), gr.State()])
1246
 
1247
  # Allow interaction with the vote buttons only when both audio samples have finished playing
1248
+ aud1.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=0), aplayed, bplayed])
1249
+ aud2.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=1), aplayed, bplayed])
1250
 
1251
  # nxt_outputs = [prevmodel1, prevmodel2, abetter, bbetter]
1252
  nxt_outputs = [abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]