sasha HF staff committed on
Commit
54bab5e
β€’
1 Parent(s): 1ac8323

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -45
app.py CHANGED
@@ -9,7 +9,8 @@ import os
9
 
10
  #from diffusers import StableDiffusionPipeline
11
 
12
- stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")
 
13
  ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
14
 
15
  title="Draw Me an Insect 🐞 /Dessine-moi un insecte 🐞"
@@ -32,11 +33,11 @@ def get_images(prompt):
32
 
33
  def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
34
 
35
- whisper_results = translate(audio)
36
- prompt = whisper_results[2]
37
  images = get_images(prompt)
38
 
39
- return whisper_results[0], whisper_results[1], whisper_results[2], images
40
 
41
  #def diffuse(prompt, guidance_scale, nb_iterations, seed):
42
  #
@@ -75,40 +76,19 @@ def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
75
  #
76
  # return images
77
 
78
- def translate(audio):
79
  print("""
80
  —
81
  Sending audio to Whisper ...
82
  —
83
  """)
84
- # current dateTime
85
- now = datetime.now()
86
- # convert to string
87
- date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
88
- print('DateTime String:', date_time_str)
89
-
90
- audio = whisper.load_audio(audio)
91
- audio = whisper.pad_or_trim(audio)
92
-
93
- mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
94
-
95
- _, probs = whisper_model.detect_language(mel)
96
-
97
- transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
98
- translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
99
-
100
- transcription = whisper.decode(whisper_model, mel, transcript_options)
101
- translation = whisper.decode(whisper_model, mel, translate_options)
102
-
103
- print("language spoken: " + transcription.language)
104
- print("transcript: " + transcription.text)
105
  print("β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”")
106
- print("translated: " + translation.text)
107
- if transcription.language == "en":
108
- tr_flag = flag.flag('GB')
109
- else:
110
- tr_flag = flag.flag(transcription.language)
111
- return tr_flag, transcription.text, translation.text
112
 
113
  ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
114
 
@@ -295,8 +275,7 @@ with gr.Blocks(css=css) as demo:
295
 
296
  """
297
  )
298
-
299
- with gr.Tab(label="Record/Enregistrer", elem_id="record_tab"):
300
  with gr.Column():
301
  record_input = gr.Audio(
302
  source="microphone",
@@ -320,6 +299,7 @@ with gr.Blocks(css=css) as demo:
320
  audio_u_translate = gr.Button("Check the transcription/Vérifier la transcription 👍", elem_id="check_btn_2")
321
  audio_u_direct_sd = gr.Button("Generate the image right now! / Génerer l'image directement! 🖌️", elem_id="magic_btn_2")
322
 
 
323
  with gr.Accordion(label="Image generation Settings/Configuration de génération d'image", elem_id="sd_settings", visible=False):
324
  with gr.Row():
325
  guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
@@ -335,21 +315,21 @@ with gr.Blocks(css=css) as demo:
335
  with gr.Row():
336
 
337
  transcripted_output = gr.Textbox(
338
- label="Transcription",
339
  lines=3,
340
  elem_id="transcripted"
341
  )
342
- language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
343
 
344
  with gr.Column():
345
  translated_output = gr.Textbox(
346
- label="Transcription in English/ Transcription traduite en anglais",
347
  lines=4,
348
  elem_id="translated"
349
  )
350
  with gr.Row():
351
  clear_btn = gr.Button(value="Clear")
352
- diffuse_btn = gr.Button(value="Generate image! Générer l'image!", elem_id="diffuse_btn")
353
 
354
  clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
355
 
@@ -407,18 +387,18 @@ with gr.Blocks(css=css) as demo:
407
 
408
  """, elem_id="about")
409
 
410
- audio_r_translate.click(translate,
411
  inputs = record_input,
412
  outputs = [
413
- language_detected_output,
414
  transcripted_output,
415
  translated_output
416
  ])
417
 
418
- audio_u_translate.click(translate,
419
  inputs = upload_input,
420
  outputs = [
421
- language_detected_output,
422
  transcripted_output,
423
  translated_output
424
  ])
@@ -431,7 +411,7 @@ with gr.Blocks(css=css) as demo:
431
  seed
432
  ],
433
  outputs = [
434
- language_detected_output,
435
  transcripted_output,
436
  translated_output,
437
  sd_output
@@ -445,7 +425,7 @@ with gr.Blocks(css=css) as demo:
445
  seed
446
  ],
447
  outputs = [
448
- language_detected_output,
449
  transcripted_output,
450
  translated_output,
451
  sd_output
@@ -456,7 +436,7 @@ with gr.Blocks(css=css) as demo:
456
  translated_output
457
  ],
458
  outputs = sd_output
459
- )
460
  gr.HTML('''
461
  <div class="footer">
462
  <p> This Space is based on the <a href="https://huggingface.co/spaces/fffiloni/whisper-to-stable-diffusion" target="_blank">Whisper to Stable Diffusion Space</a> created by <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>.
 
9
 
10
  #from diffusers import StableDiffusionPipeline
11
 
12
+ whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
13
+ stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
14
  ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
15
 
16
  title="Draw Me an Insect 🐞 /Dessine-moi un insecte 🐞"
 
33
 
34
  def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
35
 
36
+ whisper_results = translate_better(audio)
37
+ prompt = whisper_results[1]
38
  images = get_images(prompt)
39
 
40
+ return whisper_results[0], whisper_results[1], images
41
 
42
  #def diffuse(prompt, guidance_scale, nb_iterations, seed):
43
  #
 
76
  #
77
  # return images
78
 
79
+ def translate_better(audio):
80
  print("""
81
  —
82
  Sending audio to Whisper ...
83
  —
84
  """)
85
+ transcribe_text_result = whisper(audio, None, "transcribe", fn_index=0)
86
+ translate_text_result = whisper(audio, None, "translate", fn_index=0)
87
+ print("transcript: " + transcribe_text_result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  print("β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”")
89
+ print("translated: " + translate_text_result)
90
+
91
+ return transcribe_text_result, translate_text_result
 
 
 
92
 
93
  ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
94
 
 
275
 
276
  """
277
  )
278
+ with gr.Tab(label="Record/Enregistrer", elem_id="record_tab"):
 
279
  with gr.Column():
280
  record_input = gr.Audio(
281
  source="microphone",
 
299
  audio_u_translate = gr.Button("Check the transcription/Vérifier la transcription 👍", elem_id="check_btn_2")
300
  audio_u_direct_sd = gr.Button("Generate the image right now! / Génerer l'image directement! 🖌️", elem_id="magic_btn_2")
301
 
302
+
303
  with gr.Accordion(label="Image generation Settings/Configuration de génération d'image", elem_id="sd_settings", visible=False):
304
  with gr.Row():
305
  guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
 
315
  with gr.Row():
316
 
317
  transcripted_output = gr.Textbox(
318
+ label="Transcription in your detected spoken language",
319
  lines=3,
320
  elem_id="transcripted"
321
  )
322
+ #language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
323
 
324
  with gr.Column():
325
  translated_output = gr.Textbox(
326
+ label="Transcript translated in English by Whisper",
327
  lines=4,
328
  elem_id="translated"
329
  )
330
  with gr.Row():
331
  clear_btn = gr.Button(value="Clear")
332
+ diffuse_btn = gr.Button(value="OK, Diffuse this prompt !", elem_id="diffuse_btn")
333
 
334
  clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
335
 
 
387
 
388
  """, elem_id="about")
389
 
390
+ audio_r_translate.click(translate_better,
391
  inputs = record_input,
392
  outputs = [
393
+ #language_detected_output,
394
  transcripted_output,
395
  translated_output
396
  ])
397
 
398
+ audio_u_translate.click(translate_better,
399
  inputs = upload_input,
400
  outputs = [
401
+ #language_detected_output,
402
  transcripted_output,
403
  translated_output
404
  ])
 
411
  seed
412
  ],
413
  outputs = [
414
+ #language_detected_output,
415
  transcripted_output,
416
  translated_output,
417
  sd_output
 
425
  seed
426
  ],
427
  outputs = [
428
+ #language_detected_output,
429
  transcripted_output,
430
  translated_output,
431
  sd_output
 
436
  translated_output
437
  ],
438
  outputs = sd_output
439
+ )
440
  gr.HTML('''
441
  <div class="footer">
442
  <p> This Space is based on the <a href="https://huggingface.co/spaces/fffiloni/whisper-to-stable-diffusion" target="_blank">Whisper to Stable Diffusion Space</a> created by <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a>.