Update app.py
app.py CHANGED
@@ -21,13 +21,25 @@ import torch
 
 # is cuda available?
 
-from easynmt import EasyNMT
-translation_model = EasyNMT('m2m_100_418M', max_new_tokens=60)
 
-
+num_cores = psutil.cpu_count()
+os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+headers = {'Authorization': os.environ['DeepL_API_KEY']}
+
+device = "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("DEVICE IS: ")
+print(device)
+
+asr_model_base = whisper.load_model("base", device=device)
+asr_model_small = whisper.load_model("small", device=device)
+whisper_models = {
+    'base': asr_model_base,
+    'small': asr_model_small
+}
+
 transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
 
-translation_models = {
+source_languages = {
     "Afrikaans":"af",
     "Amharic":"am",
     "Arabic":"ar",
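Note on the hunk above: the commit preloads two Whisper checkpoints at startup and looks one up by name per request. A minimal sketch of that pattern, assuming the `openai-whisper` package; the `transcribe()` helper and the `sample.wav` path are illustrative, not from app.py:

```python
# Minimal sketch: select a preloaded Whisper model by name and transcribe.
# Assumes the openai-whisper package; "sample.wav" is a placeholder path.
import whisper

whisper_models = {name: whisper.load_model(name, device="cpu") for name in ("base", "small")}

def transcribe(audio_path: str, model_name: str = "base") -> str:
    model = whisper_models[model_name]
    # beam_size/best_of/without_timestamps mirror transcribe_options in app.py
    result = model.transcribe(audio_path, beam_size=3, best_of=3, without_timestamps=False)
    return result["text"]

print(transcribe("sample.wav", "base"))
```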
@@ -127,15 +139,44 @@ translation_models = {
     "Yiddish":"yi",
     "Yoruba":"yo",
     "Chinese":"zh",
-    "Zulu":"zu"
+    "Zulu":"zu",
+    "Let the model analyze": "Let the model analyze"
+}
+
+DeepL_language_codes_for_translation = {
+    "Bulgarian": "BG",
+    "Czech": "CS",
+    "Danish": "DA",
+    "German": "DE",
+    "Greek": "EL",
+    "English": "EN",
+    "Spanish": "ES",
+    "Estonian": "ET",
+    "Finnish": "FI",
+    "French": "FR",
+    "Hungarian": "HU",
+    "Indonesian": "ID",
+    "Italian": "IT",
+    "Japanese": "JA",
+    "Lithuanian": "LT",
+    "Latvian": "LV",
+    "Dutch": "NL",
+    "Polish": "PL",
+    "Portuguese": "PT",
+    "Romanian": "RO",
+    "Russian": "RU",
+    "Slovak": "SK",
+    "Slovenian": "SL",
+    "Swedish": "SV",
+    "Turkish": "TR",
+    "Ukrainian": "UK",
+    "Chinese": "ZH"
 }
 
-translation_models_list = [key[0] for key in translation_models.items()]
+source_language_list = [key[0] for key in source_languages.items()]
+translation_models_list = [key[0] for key in DeepL_language_codes_for_translation.items()]
 
 
-device = "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print("DEVICE IS: ")
-print(device)
 
 videos_out_path = Path("./videos_out")
 videos_out_path.mkdir(parents=True, exist_ok=True)
@@ -148,18 +189,19 @@ def get_youtube(video_url):
 
     return abs_video_path
 
-async def speech_to_text(video_file_path, selected_translation_lang):
+async def speech_to_text(video_file_path, selected_translation_lang, whisper_model):
    """
    # Youtube with translated subtitles using OpenAI Whisper and Opus-MT models.
    # Currently supports only English audio
    This space allows you to:
    1. Download youtube video with a given url
    2. Watch it in the first video component
-   3. Run automatic speech recognition on the video using Whisper
-   4. Translate the recognized transcriptions to
+   3. Run automatic speech recognition on the video using fast Whisper models
+   4. Translate the recognized transcriptions to 26 languages supported by DeepL (if the source language is not supported, this returns the original transcription)
    5. Burn the translations to the original video and watch the video in the 2nd video component
 
-   Speech Recognition is based on OpenAI Whisper https://github.com/openai/whisper
+   Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+   This space uses the C++ implementation from https://github.com/ggerganov/whisper.cpp
    """
 
    if(video_file_path == None):
@@ -193,20 +235,48 @@ async def speech_to_text(video_file_path, selected_translation_lang):
             }
             df = df.append(new_row, ignore_index=True)
 
-        if selected_translation_lang is None:
-            selected_translation_lang = 'Finnish'
-
-        sentences = df['text']
-        df['translation'] = translation_model.translate(sentences, target_lang=translation_models.get(selected_translation_lang), max_new_tokens=50)
-
-
-        print('After translation to target language \n')
-
         return (df)
     except Exception as e:
         raise RuntimeError("Error Running inference with local model", e)
 
 
+
+def translate_transcriptions(df, selected_translation_lang_2):
+    if selected_translation_lang_2 is None:
+        selected_translation_lang_2 = 'English'
+    df.reset_index(inplace=True)
+
+    print("start_translation")
+    translations = []
+
+    # join all segments into one newline-separated payload for a single DeepL request
+    text_combined = ""
+    for i, sentence in enumerate(df['text']):
+        if i == 0:
+            text_combined = sentence
+        else:
+            text_combined = text_combined + '\n' + sentence
+
+    data = {'text': text_combined,
+            'tag_handling': 'xml',
+            'target_lang': DeepL_language_codes_for_translation.get(selected_translation_lang_2)
+            }
+    try:
+        response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
+
+        # parse the response and map the translated lines back onto the dataframe
+        translated_sentences = json.loads(response.text)
+        translated_sentences = translated_sentences['translations'][0]['text'].split('\n')
+        df['translation'] = translated_sentences
+    except Exception as e:
+        print(e)
+        df['translation'] = df['text']
+
+    print("translations done")
+
+    return df
+
 def create_srt_and_burn(df, video_in):
 
     print("Starting creation of video with srt")
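Note on `translate_transcriptions` above: it sends every segment in one DeepL request, joined by newlines, and relies on the line count surviving the round trip. A standalone sketch under those assumptions; the `DEEPL_API_KEY` env var name and the `translate_batch` helper are hypothetical, and the key is assumed to already carry the `DeepL-Auth-Key ` prefix expected in the `Authorization` header:

```python
# Minimal sketch of the DeepL batching round trip used above.
# Assumes DEEPL_API_KEY holds a value like "DeepL-Auth-Key <your-key>";
# the env var name and example sentences are placeholders, not from app.py.
import os
import requests

def translate_batch(sentences, target_lang="FI"):
    payload = {
        "text": "\n".join(sentences),  # one request for all segments
        "target_lang": target_lang,
    }
    headers = {"Authorization": os.environ["DEEPL_API_KEY"]}
    response = requests.post("https://api-free.deepl.com/v2/translate",
                             headers=headers, data=payload)
    response.raise_for_status()
    translated = response.json()["translations"][0]["text"].split("\n")
    # DeepL may merge or drop newlines; fall back to the originals if counts differ
    return translated if len(translated) == len(sentences) else sentences

print(translate_batch(["Hello world.", "How are you?"], target_lang="DE"))
```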
@@ -286,10 +356,12 @@ video_out = gr.Video(label="Video Out", mirror_webcam=False)
 
 
 df_init = pd.DataFrame(columns=['start','end','text','translation'])
-
-
-
+selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
+selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
+selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
 
+transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+transcription_and_translation_df = gr.DataFrame(value=df_init, label="Transcription and translation dataframe", max_rows=10, wrap=True, overflow_row_behaviour='paginate')
 
 demo = gr.Blocks(css='''
 #cut_btn, #reset_btn { align-self:stretch; }
@@ -306,14 +378,14 @@ with demo:
             ### This space allows you to:
             ##### 1. Download youtube video with a given URL
             ##### 2. Watch it in the first video component
             ##### 3. Run automatic speech recognition on the video using Whisper
-            ##### 4. Translate the recognized transcriptions to
+            ##### 4. Translate the recognized transcriptions to 26 languages supported by DeepL
             ##### 5. Burn the translations to the original video and watch the video in the 2nd video component
             ''')
 
         with gr.Column():
             gr.Markdown('''
-            ### 1. Insert Youtube URL below
+            ### 1. Insert Youtube URL below. Some test videos below:
             ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
             ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
             ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
@@ -334,20 +406,17 @@ with demo:
         with gr.Column():
             gr.Markdown('''
             ##### Here you can start the transcription and translation process.
-            ##### Be aware that processing will last
+            ##### Be aware that processing will take some time. With the base model it is around 3x speed
             ''')
-
-
-            transcribe_btn.
-
-
-        with gr.Column():
-            selected_translation_lang.render()
+            selected_source_lang.render()
+            selected_whisper_model.render()
+            transcribe_btn = gr.Button("Step 2. Transcribe audio")
+            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)
+
 
     with gr.Row():
         gr.Markdown('''
-        ##### Here you will get transcription
-        ##### If you see error please remember to select translation language
+        ##### Here you will get transcription output
        ##### ''')
 
     with gr.Row():
@@ -356,12 +425,25 @@ with demo:
 
     with gr.Row():
         with gr.Column():
-
+            gr.Markdown('''
+            ##### Here you will get translated transcriptions.
+            ##### Please remember to select target language
+            ##### ''')
+            selected_translation_lang_2.render()
+            translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
+            translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2], transcription_and_translation_df)
+            transcription_and_translation_df.render()
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ##### Now press the Step 4. button to create the output video with translated transcriptions
+            ##### ''')
+            translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
             print(video_in)
-            translate_and_make_srt_btn.click(create_srt_and_burn, [
+            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_and_translation_df, video_in], [
                 video_out])
             video_out.render()
 
 
-
-demo.queue().launch(debug=True, share=False, enable_queue=True)
+demo.launch()
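Note on the Blocks wiring above: every step follows the same pattern: components are declared up front, `render()`ed where they should appear, and connected with `button.click(fn, inputs, outputs)`. A toy sketch of that pattern with the `gradio` package; `shout`, `in_box`, and `out_box` are illustrative names, not from app.py:

```python
# Toy sketch of the Blocks wiring pattern used in this commit:
# declare components first, render them where needed, connect with .click().
import gradio as gr

def shout(text):
    return text.upper()

out_box = gr.Textbox(label="Output")  # declared up front, rendered later

with gr.Blocks() as demo:
    with gr.Row():
        in_box = gr.Textbox(label="Input")
        btn = gr.Button("Step 1. Shout")
    out_box.render()
    btn.click(shout, [in_box], [out_box])  # fn, inputs, outputs

demo.launch()
```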