Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -9,7 +9,8 @@ import os
|
|
9 |
|
10 |
#from diffusers import StableDiffusionPipeline
|
11 |
|
12 |
-
|
|
|
13 |
### ββββββββββββββββββββββββββββββββββββββββ
|
14 |
|
15 |
title="Draw Me an Insect π /Dessine-moi un insecte π"
|
@@ -32,11 +33,11 @@ def get_images(prompt):
|
|
32 |
|
33 |
def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
|
34 |
|
35 |
-
whisper_results =
|
36 |
-
prompt = whisper_results[
|
37 |
images = get_images(prompt)
|
38 |
|
39 |
-
return whisper_results[0], whisper_results[1],
|
40 |
|
41 |
#def diffuse(prompt, guidance_scale, nb_iterations, seed):
|
42 |
#
|
@@ -75,40 +76,19 @@ def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
|
|
75 |
#
|
76 |
# return images
|
77 |
|
78 |
-
def
|
79 |
print("""
|
80 |
β
|
81 |
Sending audio to Whisper ...
|
82 |
β
|
83 |
""")
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
|
88 |
-
print('DateTime String:', date_time_str)
|
89 |
-
|
90 |
-
audio = whisper.load_audio(audio)
|
91 |
-
audio = whisper.pad_or_trim(audio)
|
92 |
-
|
93 |
-
mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
|
94 |
-
|
95 |
-
_, probs = whisper_model.detect_language(mel)
|
96 |
-
|
97 |
-
transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
|
98 |
-
translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
|
99 |
-
|
100 |
-
transcription = whisper.decode(whisper_model, mel, transcript_options)
|
101 |
-
translation = whisper.decode(whisper_model, mel, translate_options)
|
102 |
-
|
103 |
-
print("language spoken: " + transcription.language)
|
104 |
-
print("transcript: " + transcription.text)
|
105 |
print("βββββββββββββββββββββββββββββββββββββββββββ")
|
106 |
-
print("translated: " +
|
107 |
-
|
108 |
-
|
109 |
-
else:
|
110 |
-
tr_flag = flag.flag(transcription.language)
|
111 |
-
return tr_flag, transcription.text, translation.text
|
112 |
|
113 |
### ββββββββββββββββββββββββββββββββββββββββ
|
114 |
|
@@ -295,8 +275,7 @@ with gr.Blocks(css=css) as demo:
|
|
295 |
|
296 |
"""
|
297 |
)
|
298 |
-
|
299 |
-
with gr.Tab(label="Record/Enregistrer", elem_id="record_tab"):
|
300 |
with gr.Column():
|
301 |
record_input = gr.Audio(
|
302 |
source="microphone",
|
@@ -320,6 +299,7 @@ with gr.Blocks(css=css) as demo:
|
|
320 |
audio_u_translate = gr.Button("Check the transcription/Vérifier la transcription π", elem_id="check_btn_2")
|
321 |
audio_u_direct_sd = gr.Button("Generate the image right now! / Génerer l'image directement! ποΈ", elem_id="magic_btn_2")
|
322 |
|
|
|
323 |
with gr.Accordion(label="Image generation Settings/Configuration de génération d'image", elem_id="sd_settings", visible=False):
|
324 |
with gr.Row():
|
325 |
guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
|
@@ -335,21 +315,21 @@ with gr.Blocks(css=css) as demo:
|
|
335 |
with gr.Row():
|
336 |
|
337 |
transcripted_output = gr.Textbox(
|
338 |
-
label="Transcription",
|
339 |
lines=3,
|
340 |
elem_id="transcripted"
|
341 |
)
|
342 |
-
language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
|
343 |
|
344 |
with gr.Column():
|
345 |
translated_output = gr.Textbox(
|
346 |
-
label="
|
347 |
lines=4,
|
348 |
elem_id="translated"
|
349 |
)
|
350 |
with gr.Row():
|
351 |
clear_btn = gr.Button(value="Clear")
|
352 |
-
diffuse_btn = gr.Button(value="
|
353 |
|
354 |
clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
|
355 |
|
@@ -407,18 +387,18 @@ with gr.Blocks(css=css) as demo:
|
|
407 |
|
408 |
""", elem_id="about")
|
409 |
|
410 |
-
|
411 |
inputs = record_input,
|
412 |
outputs = [
|
413 |
-
language_detected_output,
|
414 |
transcripted_output,
|
415 |
translated_output
|
416 |
])
|
417 |
|
418 |
-
audio_u_translate.click(
|
419 |
inputs = upload_input,
|
420 |
outputs = [
|
421 |
-
language_detected_output,
|
422 |
transcripted_output,
|
423 |
translated_output
|
424 |
])
|
@@ -431,7 +411,7 @@ with gr.Blocks(css=css) as demo:
|
|
431 |
seed
|
432 |
],
|
433 |
outputs = [
|
434 |
-
language_detected_output,
|
435 |
transcripted_output,
|
436 |
translated_output,
|
437 |
sd_output
|
@@ -445,7 +425,7 @@ with gr.Blocks(css=css) as demo:
|
|
445 |
seed
|
446 |
],
|
447 |
outputs = [
|
448 |
-
language_detected_output,
|
449 |
transcripted_output,
|
450 |
translated_output,
|
451 |
sd_output
|
@@ -456,7 +436,7 @@ with gr.Blocks(css=css) as demo:
|
|
456 |
translated_output
|
457 |
],
|
458 |
outputs = sd_output
|
459 |
-
|
460 |
gr.HTML('''
|
461 |
<div class="footer">
|
462 |
<p> This Space is based on the <a href="https://huggingface.co/spaces/fffiloni/whisper-to-stable-diffusion" target="_blank">Whisper to Stable Diffusion Space</a> created by <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>.
|
|
|
9 |
|
10 |
#from diffusers import StableDiffusionPipeline
|
11 |
|
12 |
+
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
|
13 |
+
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
|
14 |
### ββββββββββββββββββββββββββββββββββββββββ
|
15 |
|
16 |
title="Draw Me an Insect π /Dessine-moi un insecte π"
|
|
|
33 |
|
34 |
def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
|
35 |
|
36 |
+
whisper_results = translate_better(audio)
|
37 |
+
prompt = whisper_results[1]
|
38 |
images = get_images(prompt)
|
39 |
|
40 |
+
return whisper_results[0], whisper_results[1], images
|
41 |
|
42 |
#def diffuse(prompt, guidance_scale, nb_iterations, seed):
|
43 |
#
|
|
|
76 |
#
|
77 |
# return images
|
78 |
|
79 |
+
def translate_better(audio):
|
80 |
print("""
|
81 |
β
|
82 |
Sending audio to Whisper ...
|
83 |
β
|
84 |
""")
|
85 |
+
transcribe_text_result = whisper(audio, None, "transcribe", fn_index=0)
|
86 |
+
translate_text_result = whisper(audio, None, "translate", fn_index=0)
|
87 |
+
print("transcript: " + transcribe_text_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
print("βββββββββββββββββββββββββββββββββββββββββββ")
|
89 |
+
print("translated: " + translate_text_result)
|
90 |
+
|
91 |
+
return transcribe_text_result, translate_text_result
|
|
|
|
|
|
|
92 |
|
93 |
### ββββββββββββββββββββββββββββββββββββββββ
|
94 |
|
|
|
275 |
|
276 |
"""
|
277 |
)
|
278 |
+
with gr.Tab(label="Record/Enregistrer", elem_id="record_tab"):
|
|
|
279 |
with gr.Column():
|
280 |
record_input = gr.Audio(
|
281 |
source="microphone",
|
|
|
299 |
audio_u_translate = gr.Button("Check the transcription/Vérifier la transcription π", elem_id="check_btn_2")
|
300 |
audio_u_direct_sd = gr.Button("Generate the image right now! / Génerer l'image directement! ποΈ", elem_id="magic_btn_2")
|
301 |
|
302 |
+
|
303 |
with gr.Accordion(label="Image generation Settings/Configuration de génération d'image", elem_id="sd_settings", visible=False):
|
304 |
with gr.Row():
|
305 |
guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
|
|
|
315 |
with gr.Row():
|
316 |
|
317 |
transcripted_output = gr.Textbox(
|
318 |
+
label="Transcription in your detected spoken language",
|
319 |
lines=3,
|
320 |
elem_id="transcripted"
|
321 |
)
|
322 |
+
#language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
|
323 |
|
324 |
with gr.Column():
|
325 |
translated_output = gr.Textbox(
|
326 |
+
label="Transcript translated in English by Whisper",
|
327 |
lines=4,
|
328 |
elem_id="translated"
|
329 |
)
|
330 |
with gr.Row():
|
331 |
clear_btn = gr.Button(value="Clear")
|
332 |
+
diffuse_btn = gr.Button(value="OK, Diffuse this prompt !", elem_id="diffuse_btn")
|
333 |
|
334 |
clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
|
335 |
|
|
|
387 |
|
388 |
""", elem_id="about")
|
389 |
|
390 |
+
audio_r_translate.click(translate_better,
|
391 |
inputs = record_input,
|
392 |
outputs = [
|
393 |
+
#language_detected_output,
|
394 |
transcripted_output,
|
395 |
translated_output
|
396 |
])
|
397 |
|
398 |
+
audio_u_translate.click(translate_better,
|
399 |
inputs = upload_input,
|
400 |
outputs = [
|
401 |
+
#language_detected_output,
|
402 |
transcripted_output,
|
403 |
translated_output
|
404 |
])
|
|
|
411 |
seed
|
412 |
],
|
413 |
outputs = [
|
414 |
+
#language_detected_output,
|
415 |
transcripted_output,
|
416 |
translated_output,
|
417 |
sd_output
|
|
|
425 |
seed
|
426 |
],
|
427 |
outputs = [
|
428 |
+
#language_detected_output,
|
429 |
transcripted_output,
|
430 |
translated_output,
|
431 |
sd_output
|
|
|
436 |
translated_output
|
437 |
],
|
438 |
outputs = sd_output
|
439 |
+
)
|
440 |
gr.HTML('''
|
441 |
<div class="footer">
|
442 |
<p> This Space is based on the <a href="https://huggingface.co/spaces/fffiloni/whisper-to-stable-diffusion" target="_blank">Whisper to Stable Diffusion Space</a> created by <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>.
|