fffiloni committed
Commit d50061a • 1 Parent(s): 3a7d7b2

Update app.py

Files changed (1)
  1. app.py +58 -42
app.py CHANGED
@@ -1,14 +1,15 @@
 import gradio as gr
 #import torch
-import whisper
-from datetime import datetime
+#import whisper
+#from datetime import datetime
 from PIL import Image
-import flag
+#import flag
 import os
 #MY_SECRET_TOKEN=os.environ.get('HF_TOKEN_SD')
 
 #from diffusers import StableDiffusionPipeline
 
+whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
 stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
 ### ————————————————————————————————————————
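The substantive change in this first hunk is dropping the local openai-whisper and flag dependencies in favor of a hosted Space. In the Gradio 3.x API used here, gr.Interface.load returns a proxy object that can be invoked like a plain function. A minimal sketch of that pattern, assuming the Space is awake and its first endpoint accepts (uploaded audio, microphone audio, task):

    import gradio as gr

    # Load a hosted Space as a callable proxy (Gradio 3.x API).
    whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")

    # Calling the proxy forwards the inputs to the remote Space and returns
    # its output; "sample.wav" is a hypothetical local audio file.
    text = whisper("sample.wav", None, "transcribe", fn_index=0)
    print(text)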
 
@@ -16,7 +17,7 @@ title="Whisper to Stable Diffusion"
 
 ### ————————————————————————————————————————
 
-whisper_model = whisper.load_model("small")
+#whisper_model = whisper.load_model("small")
 
 #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
@@ -32,8 +33,8 @@ def get_images(prompt):
 
 def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
 
-    whisper_results = translate(audio)
-    prompt = whisper_results[2]
+    whisper_results = translate_better(audio)
+    prompt = whisper_results[1]
     images = get_images(prompt)
 
     return whisper_results[0], whisper_results[1], whisper_results[2], images
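Since translate_better returns the two-tuple (transcript, translation), whisper_results[1] is the English translation that becomes the Stable Diffusion prompt. Note, though, that the unchanged return line still reads whisper_results[2], which a two-tuple does not have. A hedged sketch of the same wiring with explicit unpacking (function name illustrative, not from the commit):

    def magic_whisper_to_sd_unpacked(audio, guidance_scale, nb_iterations, seed):
        # translate_better(audio) -> (transcript, translation)
        transcript, translation = translate_better(audio)
        # The English translation serves as the image prompt.
        images = get_images(translation)
        return transcript, translation, images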
@@ -75,46 +76,61 @@ def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
 #
 #    return images
 
-def translate(audio):
+def translate_better(audio):
     print("""
    —
    Sending audio to Whisper ...
    —
    """)
-    # current dateTime
-    now = datetime.now()
-    # convert to string
-    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
-    print('DateTime String:', date_time_str)
-
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
-
-    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
-
-    _, probs = whisper_model.detect_language(mel)
-
-    transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
-    translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
-
-    transcription = whisper.decode(whisper_model, mel, transcript_options)
-    translation = whisper.decode(whisper_model, mel, translate_options)
-
-    print("language spoken: " + transcription.language)
-    print("transcript: " + transcription.text)
+    transcribe_text_result = whisper(audio, None, "transcribe", fn_index=0)
+    translate_text_result = whisper(audio, None, "translate", fn_index=0)
+    print("transcript: " + transcribe_text_result)
     print("——————————————————————————————————————————————")
-    print("translated: " + translation.text)
-    if transcription.language == "en":
-        tr_flag = flag.flag('GB')
-    else:
-        tr_flag = flag.flag(transcription.language)
-    return tr_flag, transcription.text, translation.text
+    print("translated: " + translate_text_result)
+
+    return transcribe_text_result, translate_text_result
+
+
+#def translate(audio):
+#    print("""
+#    —
+#    Sending audio to Whisper ...
+#    —
+#    """)
+#    # current dateTime
+#    now = datetime.now()
+#    # convert to string
+#    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
+#    print('DateTime String:', date_time_str)
+#
+#    audio = whisper.load_audio(audio)
+#    audio = whisper.pad_or_trim(audio)
+#
+#    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
+#
+#    _, probs = whisper_model.detect_language(mel)
+#
+#    transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
+#    translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
+#
+#    transcription = whisper.decode(whisper_model, mel, transcript_options)
+#    translation = whisper.decode(whisper_model, mel, translate_options)
+#
+#    print("language spoken: " + transcription.language)
+#    print("transcript: " + transcription.text)
+#    print("——————————————————————————————————————————————")
+#    print("translated: " + translation.text)
+#    if transcription.language == "en":
+#        tr_flag = flag.flag('GB')
+#    else:
+#        tr_flag = flag.flag(transcription.language)
+#    return tr_flag, transcription.text, translation.text
 
 ### ————————————————————————————————————————
 
 css = """
 .container {
-    max-width: 880px;
+    max-width: 780px;
     margin: auto;
     padding-top: 1.5rem;
 }
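translate_better now round-trips to the remote Space twice, once per task; the None argument fills the Space's unused microphone input and fn_index=0 targets its first API endpoint. Because a remote Space can be sleeping or rate-limited, a guard around the call may be worth adding; a sketch under that assumption (helper name illustrative, not from the commit):

    def safe_whisper_call(audio_path, task):
        # A remote Space call can fail transiently, so catch and report.
        try:
            return whisper(audio_path, None, task, fn_index=0)
        except Exception as err:  # narrow this in real code
            print(f"Whisper Space call failed: {err}")
            return ""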
@@ -339,7 +355,7 @@ with gr.Blocks(css=css) as demo:
                 lines=3,
                 elem_id="transcripted"
             )
-            language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
+            #language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
 
         with gr.Column():
             translated_output = gr.Textbox(
@@ -406,18 +422,18 @@ with gr.Blocks(css=css) as demo:
 
             """, elem_id="about")
 
-    audio_r_translate.click(translate,
+    audio_r_translate.click(translate_better,
                             inputs = record_input,
                             outputs = [
-                                language_detected_output,
+                                #language_detected_output,
                                 transcripted_output,
                                 translated_output
                             ])
 
-    audio_u_translate.click(translate,
+    audio_u_translate.click(translate_better,
                             inputs = upload_input,
                             outputs = [
-                                language_detected_output,
+                                #language_detected_output,
                                 transcripted_output,
                                 translated_output
                             ])
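With language_detected_output commented out, each .click handler maps translate_better's two return values onto the two remaining output components in order. A self-contained sketch of that wiring pattern in Gradio 3.x (component names illustrative):

    import gradio as gr

    def fake_translate(audio):
        # Stand-in for translate_better: returns (transcript, translation).
        return "bonjour", "hello"

    with gr.Blocks() as demo:
        audio_in = gr.Audio(source="microphone", type="filepath")
        btn = gr.Button("Translate")
        transcript_box = gr.Textbox(label="Transcript")
        translation_box = gr.Textbox(label="Translation")
        # The returned tuple is distributed across the outputs list in order.
        btn.click(fake_translate, inputs=audio_in,
                  outputs=[transcript_box, translation_box])

    demo.launch()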
@@ -430,7 +446,7 @@ with gr.Blocks(css=css) as demo:
                                 seed
                             ],
                             outputs = [
-                                language_detected_output,
+                                #language_detected_output,
                                 transcripted_output,
                                 translated_output,
                                 sd_output
@@ -444,7 +460,7 @@ with gr.Blocks(css=css) as demo:
                                 seed
                             ],
                             outputs = [
-                                language_detected_output,
+                                #language_detected_output,
                                 transcripted_output,
                                 translated_output,
                                 sd_output
 