mikegarts committed on
Commit
665f810
1 Parent(s): 737295b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -11
app.py CHANGED
@@ -21,6 +21,9 @@ import pandas as pd
21
  import re
22
  import time
23
 
 
 
 
24
  from pytube import YouTube
25
  import torch
26
 
@@ -28,11 +31,13 @@ INTRO_MSG = '''
28
  #### <p>There are many not very widely spoken languages for which it is quite hard to find learning materials,
29
  especially well dubbed videos (target language video with target language subs).
30
  This tool will hopefully transcribe and add subs to your videos.
31
- At least for me this is a nice tool to practice both listening and reading skills.
 
32
  <p>Speech Recognition is based on models from OpenAI Whisper - https://github.com/openai/whisper
33
  <p> This space is using the c++ implementation by https://github.com/ggerganov/whisper.cpp
34
  '''
35
 
 
36
  whisper_models = MODELS_TO_DOWNLOAD #["medium"]#["base", "small", "medium", "large", "base.en"]
37
 
38
  custom_models = []
@@ -42,6 +47,104 @@ combined_models.extend(custom_models)
42
 
43
  LANGUAGES = {
44
  "bg": "Bulgarian",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
 
47
  # language code lookup by name, with a few language aliases
@@ -60,7 +163,27 @@ def get_youtube(video_url):
60
  print(f"Download complete - {abs_video_path}")
61
  return abs_video_path
62
 
63
- def speech_to_text(video_file_path, selected_source_lang, whisper_model):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  """
65
  Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
66
  This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
@@ -90,8 +213,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
90
  try:
91
  print("starting whisper c++")
92
  os.system(f'rm -f {srt_path}')
93
- print('Running regular model')
94
- os.system(f'./whisper.cpp/main "{input_wav_file}" -t {os.cpu_count()} -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt -ovtt')
 
95
  print("whisper c++ finished")
96
  except Exception as e:
97
  raise RuntimeError("Error running Whisper cpp model")
@@ -149,6 +273,7 @@ subtitle_files = gr.File(
149
  video_player = gr.HTML('<p>video will be played here')
150
  eventslider = gr.Slider(visible=False)
151
  status_msg = gr.Markdown('Status')
 
152
 
153
  demo = gr.Blocks()
154
  demo.encrypt = False
@@ -157,24 +282,26 @@ def set_app_msg(app_state, msg):
157
  app_state['status_msg'] = msg
158
 
159
  def transcribe(app_state, youtube_url_in, selected_source_lang, selected_whisper_model):
 
160
  set_app_msg(app_state, 'Downloading the movie ...')
161
  video_file_path = get_youtube(youtube_url_in)
162
  set_app_msg(app_state, f'Running the speech to text model {selected_source_lang}/{selected_whisper_model}. This can take some time.')
163
- subtitle_files = speech_to_text(video_file_path, selected_source_lang, selected_whisper_model)
164
  set_app_msg(app_state, f'Creating the video player ...')
165
  video_player = create_video_player(subtitle_files, video_file_path)
166
- set_app_msg(app_state, f'Transcribing done, generating video player ...')
167
  return subtitle_files, video_player
168
 
169
 
170
  def on_change_event(app_state):
171
- print('Running!')
172
- return app_state['status_msg']
173
 
174
  with demo:
175
  app_state = gr.State({
176
- 'running':False,
177
- 'status_msg': ''
 
178
  })
179
 
180
  with gr.Row():
@@ -196,12 +323,13 @@ with demo:
196
 
197
  eventslider.render()
198
  status_msg.render()
 
199
  subtitle_files.render()
200
  video_player.render()
201
  with gr.Row():
202
  gr.Markdown('This app is based on [this code](https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles/tree/main) by RASMUS.')
203
 
204
- dep = demo.load(on_change_event, inputs=[app_state], outputs=[status_msg], every=10)
205
 
206
 
207
  #### RUN ###
 
21
  import re
22
  import time
23
 
24
+ import subprocess
25
+ import shlex
26
+
27
  from pytube import YouTube
28
  import torch
29
 
 
31
  #### <p>There are many not very widely spoken languages for which it is quite hard to find learning materials,
32
  especially well dubbed videos (target language video with target language subs).
33
  This tool will hopefully transcribe and add subs to your videos.
34
+ At least for me this is a nice tool to practice both listening and reading skills.
35
+ This is a 'one-click' variant of similar spaces found here on the HF hub.
36
  <p>Speech Recognition is based on models from OpenAI Whisper - https://github.com/openai/whisper
37
  <p> This space is using the c++ implementation by https://github.com/ggerganov/whisper.cpp
38
  '''
39
 
40
+
41
  whisper_models = MODELS_TO_DOWNLOAD #["medium"]#["base", "small", "medium", "large", "base.en"]
42
 
43
  custom_models = []
 
47
 
48
  LANGUAGES = {
49
  "bg": "Bulgarian",
50
+ "en": "English",
51
+ "zh": "Chinese",
52
+ "de": "German",
53
+ "es": "Spanish",
54
+ "ru": "Russian",
55
+ "ko": "Korean",
56
+ "fr": "French",
57
+ "ja": "Japanese",
58
+ "pt": "Portuguese",
59
+ "tr": "Turkish",
60
+ "pl": "Polish",
61
+ "ca": "Catalan",
62
+ "nl": "Dutch",
63
+ "ar": "Arabic",
64
+ "sv": "Swedish",
65
+ "it": "Italian",
66
+ "id": "Indonesian",
67
+ "hi": "Hindi",
68
+ "fi": "Finnish",
69
+ "vi": "Vietnamese",
70
+ "he": "Hebrew",
71
+ "uk": "Ukrainian",
72
+ "el": "Greek",
73
+ "ms": "Malay",
74
+ "cs": "Czech",
75
+ "ro": "Romanian",
76
+ "da": "Danish",
77
+ "hu": "Hungarian",
78
+ "ta": "Tamil",
79
+ "no": "Norwegian",
80
+ "th": "Thai",
81
+ "ur": "Urdu",
82
+ "hr": "Croatian",
83
+ "lt": "Lithuanian",
84
+ "la": "Latin",
85
+ "mi": "Maori",
86
+ "ml": "Malayalam",
87
+ "cy": "Welsh",
88
+ "sk": "Slovak",
89
+ "te": "Telugu",
90
+ "fa": "Persian",
91
+ "lv": "Latvian",
92
+ "bn": "Bengali",
93
+ "sr": "Serbian",
94
+ "az": "Azerbaijani",
95
+ "sl": "Slovenian",
96
+ "kn": "Kannada",
97
+ "et": "Estonian",
98
+ "mk": "Macedonian",
99
+ "br": "Breton",
100
+ "eu": "Basque",
101
+ "is": "Icelandic",
102
+ "hy": "Armenian",
103
+ "ne": "Nepali",
104
+ "mn": "Mongolian",
105
+ "bs": "Bosnian",
106
+ "kk": "Kazakh",
107
+ "sq": "Albanian",
108
+ "sw": "Swahili",
109
+ "gl": "Galician",
110
+ "mr": "Marathi",
111
+ "pa": "Punjabi",
112
+ "si": "Sinhala",
113
+ "km": "Khmer",
114
+ "sn": "Shona",
115
+ "yo": "Yoruba",
116
+ "so": "Somali",
117
+ "af": "Afrikaans",
118
+ "oc": "Occitan",
119
+ "ka": "Georgian",
120
+ "be": "Belarusian",
121
+ "tg": "Tajik",
122
+ "sd": "Sindhi",
123
+ "gu": "Gujarati",
124
+ "am": "Amharic",
125
+ "yi": "Yiddish",
126
+ "lo": "Lao",
127
+ "uz": "Uzbek",
128
+ "fo": "Faroese",
129
+ "ht": "Haitian creole",
130
+ "ps": "Pashto",
131
+ "tk": "Turkmen",
132
+ "nn": "Nynorsk",
133
+ "mt": "Maltese",
134
+ "sa": "Sanskrit",
135
+ "lb": "Luxembourgish",
136
+ "my": "Myanmar",
137
+ "bo": "Tibetan",
138
+ "tl": "Tagalog",
139
+ "mg": "Malagasy",
140
+ "as": "Assamese",
141
+ "tt": "Tatar",
142
+ "haw": "Hawaiian",
143
+ "ln": "Lingala",
144
+ "ha": "Hausa",
145
+ "ba": "Bashkir",
146
+ "jw": "Javanese",
147
+ "su": "Sundanese",
148
  }
149
 
150
  # language code lookup by name, with a few language aliases
 
163
  print(f"Download complete - {abs_video_path}")
164
  return abs_video_path
165
 
166
def run_command(command, app_state):
    """Run *command* in a shell, streaming its stdout into app_state['output'].

    Parameters
    ----------
    command : str
        Shell command line to execute.
    app_state : dict
        Mutable app state; the decoded stdout of the process is appended
        to app_state['output'] line by line as it arrives.

    Returns
    -------
    int
        The process return code.
    """
    print(command)
    # NOTE(review): shell=True executes the string via the shell; if `command`
    # ever embeds user-controlled text this is an injection risk — prefer a
    # list of args with shell=False at the call sites.
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    # Read until EOF instead of gating on poll(): the previous loop slept 5s
    # between lines and silently dropped any output buffered after the
    # process exited.
    for raw_line in process.stdout:
        decoded = raw_line.decode()
        print(decoded)
        app_state['output'] += decoded
    rc = process.wait()
    # fix: original referenced undefined name `cmd` here (NameError).
    print(f'{command} ret code is {rc}')
    return rc
182
+
183
+ def speech_to_text(video_file_path,
184
+ selected_source_lang,
185
+ whisper_model,
186
+ app_state):
187
  """
188
  Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
189
  This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
 
213
  try:
214
  print("starting whisper c++")
215
  os.system(f'rm -f {srt_path}')
216
+ run_command(f'./whisper.cpp/main "{input_wav_file}" -t {os.cpu_count()} -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt -ovtt',
217
+ app_state)
218
+ # os.system(f'./whisper.cpp/main "{input_wav_file}" -t {os.cpu_count()} -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt -ovtt')
219
  print("whisper c++ finished")
220
  except Exception as e:
221
  raise RuntimeError("Error running Whisper cpp model")
 
273
  video_player = gr.HTML('<p>video will be played here')
274
  eventslider = gr.Slider(visible=False)
275
  status_msg = gr.Markdown('Status')
276
+ output_label = gr.Textbox('', interactive=False, show_label=False)
277
 
278
  demo = gr.Blocks()
279
  demo.encrypt = False
 
282
  app_state['status_msg'] = msg
283
 
284
def transcribe(app_state, youtube_url_in, selected_source_lang, selected_whisper_model):
    """Download a YouTube video, run speech-to-text, and build the player.

    Progress is reported through app_state['status_msg'] (polled by the UI)
    and model output accumulates in app_state['output'].
    Returns the subtitle file list and the video-player HTML.
    """
    # Reset accumulated model output for this run.
    app_state['output'] = ''

    set_app_msg(app_state, 'Downloading the movie ...')
    movie_path = get_youtube(youtube_url_in)

    set_app_msg(app_state, f'Running the speech to text model {selected_source_lang}/{selected_whisper_model}. This can take some time.')
    srt_files = speech_to_text(movie_path, selected_source_lang, selected_whisper_model, app_state)

    set_app_msg(app_state, f'Creating the video player ...')
    player_html = create_video_player(srt_files, movie_path)
    set_app_msg(app_state, f'Transcribing done, generating video player')

    return srt_files, player_html
294
 
295
 
296
def on_change_event(app_state):
    """Periodic UI refresh callback: surface the current status and output.

    Returns a (status_msg, output) pair consumed by the status Markdown
    widget and the output Textbox.
    """
    print(f'Running! {app_state}')
    status = app_state['status_msg']
    captured_output = app_state['output']
    return status, captured_output
299
 
300
  with demo:
301
  app_state = gr.State({
302
+ 'running': False,
303
+ 'status_msg': '',
304
+ 'output': ''
305
  })
306
 
307
  with gr.Row():
 
323
 
324
  eventslider.render()
325
  status_msg.render()
326
+ output_label.render()
327
  subtitle_files.render()
328
  video_player.render()
329
  with gr.Row():
330
  gr.Markdown('This app is based on [this code](https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles/tree/main) by RASMUS.')
331
 
332
+ dep = demo.load(on_change_event, inputs=[app_state], outputs=[status_msg, output_label], every=10)
333
 
334
 
335
  #### RUN ###