|
import gradio as gr |
|
import os |
|
from pathlib import Path |
|
import time |
|
|
|
import pandas as pd |
|
import re |
|
import time |
|
import os |
|
|
|
import whisper |
|
from pytube import YouTube |
|
|
|
import psutil |
|
# Size the OpenMP thread pool from the CPU count psutil reports.
# NOTE(review): this runs before `import torch` further down the file,
# presumably so the variable is already set when torch initialises —
# confirm before reordering the imports.
num_cores = psutil.cpu_count()
os.environ["OMP_NUM_THREADS"] = str(num_cores)
|
|
|
|
|
import torch |
|
|
|
|
|
|
|
|
|
from easynmt import EasyNMT |
|
# Translation model: EasyNMT's 'm2m_100_418M', loaded once at import time.
translation_model = EasyNMT("m2m_100_418M", max_new_tokens=60)

# Speech-recognition model: Whisper "base" checkpoint, loaded once at import.
asr_model = whisper.load_model("base")

# Module-level Whisper decoding options.
# NOTE(review): speech_to_text builds its own local `transcribe_options`,
# so this module-level value is not what that function passes to Whisper.
transcribe_options = {
    "beam_size": 3,
    "best_of": 3,
    "without_timestamps": False,
}
|
|
|
# Maps the language name shown in the UI dropdown to the language code that
# is passed as `target_lang` to the translation model.
# The codes follow the M2M-100 language list; note that three of them are
# three letters long ("ast", "ceb", "ilo").
translation_models = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan; Valencian": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic; Scottish Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian; Haitian Creole": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
    "Igbo": "ig",
    "Iloko": "ilo",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Georgian": "ka",
    "Kazakh": "kk",
    "Central Khmer": "km",
    "Kannada": "kn",
    "Korean": "ko",
    "Luxembourgish; Letzeburgesch": "lb",
    "Ganda": "lg",
    "Lingala": "ln",
    "Lao": "lo",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Malagasy": "mg",
    "Macedonian": "mk",
    "Malayalam": "ml",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Burmese": "my",
    "Nepali": "ne",
    "Dutch; Flemish": "nl",
    "Norwegian": "no",
    "Northern Sotho": "ns",
    "Occitan (post 1500)": "oc",
    "Oriya": "or",
    "Panjabi; Punjabi": "pa",
    "Polish": "pl",
    "Pushto; Pashto": "ps",
    "Portuguese": "pt",
    "Romanian; Moldavian; Moldovan": "ro",
    "Russian": "ru",
    "Sindhi": "sd",
    "Sinhala; Sinhalese": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Albanian": "sq",
    "Serbian": "sr",
    "Swati": "ss",
    "Sundanese": "su",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Tswana": "tn",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Wolof": "wo",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Chinese": "zh",
    "Zulu": "zu",
}

# Language names in declaration order; used as the dropdown choices.
translation_models_list = list(translation_models)
|
|
|
|
|
# Device label; this script hard-codes the CPU.
device = "cpu"
print("DEVICE IS: ", device, sep="\n")

# Make sure the ./videos_out directory exists (created at import time).
videos_out_path = Path("./videos_out")
videos_out_path.mkdir(exist_ok=True, parents=True)
|
|
|
def get_youtube(video_url):
    """Download the YouTube video at ``video_url`` and return the saved path.

    Of the progressive MP4 streams pytube lists for the video, the first one
    after sorting by resolution in descending order is downloaded. No output
    directory is passed, so pytube's default download location applies.
    """
    progressive_mp4 = YouTube(video_url).streams.filter(
        progressive=True, file_extension='mp4'
    )
    chosen_stream = progressive_mp4.order_by('resolution').desc().first()
    abs_video_path = chosen_stream.download()

    print("LADATATTU POLKUUN")  # Finnish for "downloaded to path"
    print(abs_video_path)
    return abs_video_path
|
|
|
async def speech_to_text(video_file_path, selected_translation_lang):
    """Transcribe a video's audio with Whisper and translate every segment.

    Args:
        video_file_path: Path of the video file whose audio is transcribed.
        selected_translation_lang: Display name of the target language, i.e.
            a key of ``translation_models``. ``None`` falls back to
            ``'Finnish'``.

    Returns:
        A ``pandas.DataFrame`` with one row per Whisper segment and the
        columns ``start``, ``end``, ``text`` (as reported by Whisper) and
        ``translation`` (the output of the translation model).

    Raises:
        ValueError: If ``video_file_path`` is ``None``.
        RuntimeError: If the audio cannot be loaded, or if transcription or
            translation fails. The underlying exception is chained as the
            cause and, for inference errors, also kept as the second
            exception argument.
    """
    if video_file_path is None:
        raise ValueError("Error no video input")
    print(video_file_path)

    try:
        audio = whisper.load_audio(video_file_path)
    except Exception as e:
        raise RuntimeError("Error converting video to audio") from e

    try:
        print('Transcribing via local model')
        transcribe_options = dict(beam_size=5, best_of=5, without_timestamps=False)
        transcription = asr_model.transcribe(audio, **transcribe_options)

        # Build the frame in one go: DataFrame.append was removed in
        # pandas 2.0 and was quadratic when called once per row.
        rows = [
            {'start': segment['start'], 'end': segment['end'], 'text': segment['text']}
            for segment in transcription['segments']
        ]
        df = pd.DataFrame(rows, columns=['start', 'end', 'text'])

        if selected_translation_lang is None:
            selected_translation_lang = 'Finnish'

        # Hand the translator a plain list of sentences rather than a Series.
        df['translation'] = translation_model.translate(
            df['text'].tolist(),
            target_lang=translation_models.get(selected_translation_lang),
            max_new_tokens=50,
        )

        print('After translation to target language \n')
        return df
    except Exception as e:
        raise RuntimeError("Error Running inference with local model", e) from e
|
|
|
|
|
def _format_srt_timestamp(seconds):
    """Convert an offset in seconds to an SRT timestamp (``HH:MM:SS,mmm``)."""
    # float()/int() coercion keeps the ``d`` format codes valid for NumPy
    # scalars and for numeric strings as well as plain floats.
    total_ms = int(round(float(seconds) * 1000.0))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def create_srt_and_burn(df, video_in):
    """Write the translations to an SRT file and burn them into the video.

    The subtitles are written to ``testi.srt`` in the current working
    directory, then ``ffmpeg`` is invoked with its ``subtitles`` filter to
    render them onto ``video_in``.

    Args:
        df: DataFrame with ``start`` and ``end`` columns (offsets in seconds)
            and a ``translation`` column holding the subtitle text.
        video_in: Path of the source video file.

    Returns:
        The output video path: ``video_in`` with its extension replaced by
        ``_out.mp4``. The path is returned even when ffmpeg fails or is not
        installed (best effort); the failure is printed.

    Raises:
        ValueError: If ``video_in`` is ``None``.
    """
    # Local import: subprocess is not among this file's top-level imports.
    import subprocess

    if video_in is None:
        raise ValueError("Error no video input")

    print("Starting creation of video wit srt")

    # One SRT cue per row: index line, timing line, subtitle text.
    cues = [
        f"{index}\n"
        f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}\n"
        f"{translation}"
        for index, (start, end, translation) in enumerate(
            zip(df['start'], df['end'], df['translation']), start=1
        )
    ]
    srt_text = "\n\n".join(cues)

    with open('testi.srt', 'w', encoding="utf-8") as file:
        file.write(srt_text)

    print("SRT DONE")
    print(srt_text)
    print(video_in)

    # Replace only the file extension; a plain str.replace('.mp4', ...) would
    # also rewrite directory names and would leave non-mp4 inputs unchanged,
    # making ffmpeg's output path equal to its input path.
    root, _extension = os.path.splitext(video_in)
    video_out = f"{root}_out.mp4"
    print(video_out)

    # Argument list instead of a shell string: the file name comes from the
    # user and must not be interpreted by a shell.
    command = ['ffmpeg', '-i', video_in, '-y', '-vf', 'subtitles=./testi.srt', video_out]
    print(command)
    try:
        completed = subprocess.run(command, check=False)
        if completed.returncode != 0:
            print(f"ffmpeg exited with status {completed.returncode}")
    except OSError as e:
        # Raised e.g. when the ffmpeg executable cannot be found.
        print(e)
    return video_out
|
|
|
|
|
|
|
# UI components are constructed here and placed into the page later through
# their .render() calls inside the Blocks layout. The construction order is
# kept as-is.
video_in = gr.Video(
    label="Video file",
    mirror_webcam=False,
)
youtube_url_in = gr.Textbox(
    label="Youtube url",
    lines=1,
    interactive=True,
)
video_out = gr.Video(
    label="Video Out",
    mirror_webcam=False,
)

# Empty frame carrying the four column names, used as the table's initial value.
df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])

selected_translation_lang = gr.Dropdown(
    choices=translation_models_list,
    type="value",
    value="English",
    label="Language to translate transcriptions to",
    interactive=True,
)

transcription_df = gr.DataFrame(
    value=df_init,
    label="Transcription dataframe",
    row_count=(0, "dynamic"),
    max_rows=10,
)
|
|
|
|
|
# Page layout for the app: a gr.Blocks container with custom CSS, holding the
# instruction texts, the three step buttons and the pre-built components.
# NOTE(review): the nesting below was reconstructed from source whose
# indentation had been flattened — confirm the Row/Column structure against
# the running app.
demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False
with demo:
    # NOTE(review): not referenced anywhere else in the visible code.
    transcription_var = gr.Variable()

    # Row 1: overview of the workflow (left) and example URLs (right).
    with gr.Row():
        with gr.Column():
            # NOTE(review): step 4 names three languages, while the dropdown
            # offers every key of translation_models.
            gr.Markdown('''
            ### This space allows you to:
            ##### 1. Download youtube video with a given URL
            ##### 2. Watch it in the first video component
            ##### 3. Run automatic speech recognition on the video using Whisper (Please remember to select translation language)
            ##### 4. Translate the recognized transcriptions to Finnish, Swedish, Danish
            ##### 5. Burn the translations to the original video and watch the video in the 2nd video component
            ''')

        with gr.Column():
            gr.Markdown('''
            ### 1. Insert Youtube URL below (Some examples below which I suggest to use for first tests)
            ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
            ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
            ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
            ''')

    # Row 2 (step 1): URL box and download button; get_youtube's return value
    # is sent to the input video component.
    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
            download_youtube_btn.click(get_youtube, [youtube_url_in], [
                video_in])
            print(video_in)  # debug output, runs once while the layout is built

    # Row 3 (step 2): input video next to the transcribe/translate button;
    # speech_to_text's result is sent to the transcription table.
    with gr.Row():
        with gr.Column():
            video_in.render()
        with gr.Column():
            gr.Markdown('''
            ##### Here you can start the transcription and translation process.
            ##### Be aware that processing will last for a while (35 second video took around 20 seconds in my testing)
            ''')
            transcribe_btn = gr.Button("Step 2. Transcribe and translate audio")

            transcribe_btn.click(speech_to_text, [video_in, selected_translation_lang], transcription_df)

    # Row 4: target-language dropdown (read by the step 2 handler).
    with gr.Row():
        with gr.Column():
            selected_translation_lang.render()

    # Row 5: explanatory text for the results table.
    with gr.Row():
        gr.Markdown('''
        ##### Here you will get transcription and translation output
        ##### If you see error please remember to select translation language
        ##### ''')

    # Row 6: transcription/translation table.
    with gr.Row():
        with gr.Column():
            transcription_df.render()

    # Row 7 (step 3): burn button and the output video; create_srt_and_burn
    # receives the table and the input video and its result goes to video_out.
    with gr.Row():
        with gr.Column():
            translate_and_make_srt_btn = gr.Button("Step 3. Create and burn srt to video")
            print(video_in)  # debug output, runs once while the layout is built
            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_df,video_in], [
                video_out])
            video_out.render()
|
|
|
|
|
if __name__ == "__main__":
    # Turn on request queuing, then start the app in debug mode without a
    # public share link.
    queued_demo = demo.queue()
    queued_demo.launch(debug=True, share=False, enable_queue=True)