save samples to cache; load cached; a way to fix #4
app.py
CHANGED
@@ -13,6 +13,20 @@ from detoxify import Detoxify
 import os
 import tempfile
 from pydub import AudioSegment
+import itertools
+from typing import List, Tuple, Set, Dict
+from hashlib import sha1
+
+class User:
+    def __init__(self, user_id: str):
+        self.user_id = user_id
+        self.voted_pairs: Set[Tuple[str, str]] = set()
+
+class Sample:
+    def __init__(self, filename: str, transcript: str, modelName: str):
+        self.filename = filename
+        self.transcript = transcript
+        self.modelName = modelName
 
 def match_target_amplitude(sound, target_dBFS):
     change_in_dBFS = target_dBFS - sound.dBFS
@@ -257,6 +271,27 @@ OVERRIDE_INPUTS = {
 }
 
 hf_clients = {}
+# cache audio samples for quick voting
+cached_samples: List[Sample] = []
+voting_users = {
+    # userid as the key and USER() as the value
+}
+
+def generate_matching_pairs(samples: List[Sample]) -> List[Tuple[Sample, Sample]]:
+    transcript_groups: Dict[str, List[Sample]] = {}
+    for sample in samples:
+        if sample.transcript not in transcript_groups:
+            transcript_groups[sample.transcript] = []
+        transcript_groups[sample.transcript].append(sample)
+
+    matching_pairs: List[Tuple[Sample, Sample]] = []
+    for group in transcript_groups.values():
+        matching_pairs.extend(list(itertools.combinations(group, 2)))
+
+    return matching_pairs
+
+# List[Tuple[Sample, Sample]]
+all_pairs = []
 
 SPACE_ID = os.getenv('SPACE_ID')
 MAX_SAMPLE_TXT_LENGTH = 300
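For reference, a standalone sketch of how `generate_matching_pairs` behaves: samples are grouped by transcript, and every two samples within a group become one comparison pair. The `Sample` values below are made up for illustration, and `setdefault` condenses the diff's explicit membership check.

import itertools
from typing import Dict, List, Tuple

class Sample:
    def __init__(self, filename: str, transcript: str, modelName: str):
        self.filename = filename
        self.transcript = transcript
        self.modelName = modelName

def generate_matching_pairs(samples: List[Sample]) -> List[Tuple[Sample, Sample]]:
    # group samples that share a transcript, then pair every two within a group
    transcript_groups: Dict[str, List[Sample]] = {}
    for sample in samples:
        transcript_groups.setdefault(sample.transcript, []).append(sample)
    matching_pairs: List[Tuple[Sample, Sample]] = []
    for group in transcript_groups.values():
        matching_pairs.extend(itertools.combinations(group, 2))
    return matching_pairs

# toy data: three models synthesized the same sentence, one synthesized another
samples = [
    Sample("a.wav", "Hello there.", "model-a"),
    Sample("b.wav", "Hello there.", "model-b"),
    Sample("c.wav", "Hello there.", "model-c"),
    Sample("d.wav", "Good morning.", "model-a"),
]
print([(p[0].modelName, p[1].modelName) for p in generate_matching_pairs(samples)])
# -> [('model-a', 'model-b'), ('model-a', 'model-c'), ('model-b', 'model-c')]

Only samples that share a transcript are ever paired, so the lone "Good morning." sample yields no pair, and each transcript group contributes n·(n−1)/2 pairs.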
@@ -378,7 +413,9 @@ Vote to help the community find the best available text-to-speech model!
 INSTR = """
 ## 🗳️ Vote
 
-* Input text (English only) to synthesize audio
+* Input text (English only) to synthesize audio.
+* Press ⚡ to select a cached sample you have yet to vote on. Fast.
+* Press 🎲 to randomly select text for a list. Slow.
 * Listen to the two audio clips, one after the other.
 * Vote on which audio sounds more natural to you.
 * _Note: Model names are revealed after the vote is cast._
@@ -611,7 +648,7 @@ def make_link_to_space(model_name):
     model_basename = HF_SPACES[model_name]['name']
 
     if '/' in model_name:
-        return '🤗 <a style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
+        return '🤗 <a target="_top" style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
 
     # otherwise just return the model name
     return model_name
@@ -989,6 +1026,26 @@ def synthandreturn(text):
     #debug
     # outputs = [text, btn, r2, model1, model2, aud1, aud2, abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
 
+    # cache the result
+    for model in [mdl1k, mdl2k]:
+        already_cached = False
+        # check if already cached
+        for cached_sample in cached_samples:
+            # TODO:replace cached
+            if (cached_sample.transcript == text and cached_sample.modelName == model):
+                already_cached = True
+                break
+
+        if (already_cached):
+            continue
+
+        print(f"Cached {model}")
+        cached_samples.append(Sample(results[model], text, model))
+    # print(cached_samples)
+
+    all_pairs = generate_matching_pairs(cached_samples)
+    # print(all_pairs)
+
     print(f"Retrieving models {mdl1k} and {mdl2k} from API")
     return (
         text,
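The caching block above does a linear scan of `cached_samples` for each model to avoid storing the same (transcript, model) result twice. A minimal standalone sketch of the same de-duplication using a keyed index; this is not the commit's code, and the filenames and model names are stand-ins.

from typing import Dict, List, Tuple

class Sample:
    def __init__(self, filename: str, transcript: str, modelName: str):
        self.filename = filename
        self.transcript = transcript
        self.modelName = modelName

cached_samples: List[Sample] = []
cache_index: Dict[Tuple[str, str], int] = {}  # (transcript, modelName) -> position in cached_samples

def cache_result(filename: str, transcript: str, model: str) -> bool:
    # skip the append when the same (transcript, model) pair is already cached
    key = (transcript, model)
    if key in cache_index:
        return False
    cache_index[key] = len(cached_samples)
    cached_samples.append(Sample(filename, transcript, model))
    return True

print(cache_result("/tmp/a.wav", "Hello there.", "model-a"))   # True: newly cached
print(cache_result("/tmp/a2.wav", "Hello there.", "model-a"))  # False: duplicate, kept out
print(len(cached_samples))                                     # 1

Keeping such an index would also make the diff's "TODO:replace cached" cheap to act on later, since the stored position could be overwritten in place.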
@@ -1046,26 +1103,94 @@ def unlock_vote(btn_index, aplayed, bplayed):
 
     return [gr.update(), gr.update(), aplayed, bplayed]
 
+def cachedsent(request: gr.Request):
+    # add new userid to voting_users from Browser session hash
+    # stored only in RAM
+    if request.username:
+        print('auth by username')
+        # by HuggingFace username
+        userid = sha1(bytes(request.username.encode('ascii'))).hexdigest()
+    else:
+        print('auth by ip')
+        # by IP address
+        userid = sha1(bytes(request.client.host.encode('ascii'))).hexdigest()
+        # by browser session hash
+        # userid = sha1(bytes(request.session_hash.encode('ascii')), usedforsecurity=False).hexdigest() # Session hash changes on page reload
+
+    if userid not in voting_users:
+        voting_users[userid] = User(userid)
+
+    def get_next_pair(user: User):
+        # all_pairs = generate_matching_pairs(cached_samples)
+
+        # for pair in all_pairs:
+        for pair in generate_matching_pairs(cached_samples):
+            pair_key = (pair[0].filename, pair[1].filename)
+            if pair_key not in user.voted_pairs and (pair_key[1], pair_key[0]) not in user.voted_pairs:
+                return pair
+        return None
+
+    pair = get_next_pair(voting_users[userid])
+    if pair is None:
+        return [*clear_stuff(), gr.update(interactive=False)]
+
+    # TODO: move to abisbetter
+    voting_users[userid].voted_pairs.add((pair[0].filename, pair[1].filename))
+    return (
+        pair[0].transcript,
+        "Synthesize",
+        gr.update(visible=True), # r2
+        pair[0].modelName, # model1
+        pair[1].modelName, # model2
+        gr.update(visible=True, value=pair[0].filename), # aud1
+        gr.update(visible=True, value=pair[1].filename), # aud2
+        gr.update(visible=True, interactive=False), #abetter
+        gr.update(visible=True, interactive=False), #bbetter
+        gr.update(visible=False), #prevmodel1
+        gr.update(visible=False), #prevmodel2
+        gr.update(visible=False), #nxt round btn
+        # reset aplayed, bplayed audio playback events
+        gr.update(value=False), #aplayed
+        gr.update(value=False), #bplayed
+        # fetch cached btn
+        gr.update(interactive=True)
+    )
 def randomsent():
-    return random.choice(sents), '🎲'
+    return '⚡', random.choice(sents), '🎲'
 def clear_stuff():
-    return
+    return [
+        '',
+        "Synthesize",
+        gr.update(visible=True), # r2
+        '', # model1
+        '', # model2
+        gr.update(visible=False), # aud1
+        gr.update(visible=False), # aud2
+        gr.update(visible=False), #abetter
+        gr.update(visible=False), #bbetter
+        gr.update(visible=False), #prevmodel1
+        gr.update(visible=False), #prevmodel2
+        gr.update(visible=False), #nxt round btn
+        gr.update(value=False), #aplayed
+        gr.update(value=False), #bplayed
+    ]
 def disable():
     return [gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)]
 def enable():
     return [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)]
 with gr.Blocks() as vote:
     # sample played
-
-
+    aplayed = gr.State(value=False)
+    bplayed = gr.State(value=False)
     # voter ID
     useridstate = gr.State()
     gr.Markdown(INSTR)
     with gr.Group():
         with gr.Row():
+            cachedt = gr.Button('⚡', scale=0, min_width=0, variant='tool', interactive=len(cached_samples)>0)
             text = gr.Textbox(container=False, show_label=False, placeholder="Enter text to synthesize", lines=1, max_lines=1, scale=9999999, min_width=0)
             randomt = gr.Button('🎲', scale=0, min_width=0, variant='tool')
-            randomt.click(randomsent, outputs=[text, randomt])
+            randomt.click(randomsent, outputs=[cachedt, text, randomt])
     btn = gr.Button("Synthesize", variant='primary')
     model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
     #model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=True)
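`cachedsent` identifies a voter by the SHA-1 of the HuggingFace username when available, otherwise of the request's IP address, and then skips any pair that voter has already seen in either order. A condensed, standalone sketch of those two checks; `derive_userid` is a hypothetical helper rather than a function in the commit, and the IP address is a documentation placeholder.

from hashlib import sha1
from typing import Optional, Set, Tuple

def derive_userid(username: Optional[str], client_host: str) -> str:
    # prefer the HuggingFace username; fall back to the request's IP address
    raw = username if username else client_host
    return sha1(raw.encode('ascii')).hexdigest()

voted_pairs: Set[Tuple[str, str]] = {("a.wav", "b.wav")}

def is_unvoted(pair_key: Tuple[str, str]) -> bool:
    # a pair counts as voted no matter which order it was stored in
    return pair_key not in voted_pairs and (pair_key[1], pair_key[0]) not in voted_pairs

print(derive_userid(None, "203.0.113.7"))  # hashed IP fallback
print(is_unvoted(("b.wav", "a.wav")))      # False: same pair, reversed order
print(is_unvoted(("a.wav", "c.wav")))      # True: not voted on yet

Because `voting_users` lives only in process memory ("stored only in RAM" in the diff), the voted-pair history resets whenever the Space restarts.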
@@ -1096,7 +1221,9 @@ with gr.Blocks() as vote:
         bbetter,
         prevmodel1,
         prevmodel2,
-        nxtroundbtn
+        nxtroundbtn,
+        aplayed,
+        bplayed,
     ]
     """
         text,
@@ -1111,12 +1238,15 @@ with gr.Blocks() as vote:
         gr.update(visible=False), #prevmodel1
         gr.update(visible=False), #prevmodel2
         gr.update(visible=False), #nxt round btn"""
-    btn.click(disable, outputs=[btn, abetter, bbetter]).then(synthandreturn, inputs=[text], outputs=outputs).then(enable, outputs=[btn,
-    nxtroundbtn.click(
+    btn.click(disable, outputs=[btn, abetter, bbetter]).then(synthandreturn, inputs=[text], outputs=outputs).then(enable, outputs=[btn, gr.State(), gr.State()])
+    nxtroundbtn.click(cachedsent, outputs=[*outputs, cachedt])
+
+    # fetch a comparison pair from cache
+    cachedt.click(disable, outputs=[cachedt, abetter, bbetter]).then(cachedsent, outputs=[*outputs, cachedt]).then(enable, outputs=[btn, gr.State(), gr.State()])
 
     # Allow interaction with the vote buttons only when both audio samples have finished playing
-
-
+    aud1.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=0), aplayed, bplayed])
+    aud2.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=1), aplayed, bplayed])
 
     # nxt_outputs = [prevmodel1, prevmodel2, abetter, bbetter]
     nxt_outputs = [abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
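The two `.stop` handlers pass a constant button index (0 for clip A, 1 for clip B) plus the `aplayed`/`bplayed` states into `unlock_vote`; only that function's fall-through return appears in this diff. A hedged sketch of what the unlock logic presumably looks like, inferred from the call sites and that single context line; everything except the final return is an assumption.

import gradio as gr

def unlock_vote(btn_index, aplayed, bplayed):
    # assumed: mark which of the two audio players just finished (0 = A, 1 = B)
    if btn_index == 0:
        aplayed = True
    if btn_index == 1:
        bplayed = True

    # assumed: once both clips have been heard, enable the two vote buttons
    if aplayed and bplayed:
        return [gr.update(interactive=True), gr.update(interactive=True), aplayed, bplayed]

    # from the diff context: otherwise leave the buttons untouched
    return [gr.update(), gr.update(), aplayed, bplayed]

# toy call: only clip A has finished, so the vote buttons stay locked
print(unlock_vote(0, False, False))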