|
|
|
import os
import subprocess

# Whisper model sizes fetched with whisper.cpp's download script.
MODELS_TO_DOWNLOAD = ['base', 'small', 'tiny', 'medium']


def _run_setup_step(args):
    """Run one set-up command; report (but do not abort on) a failure.

    Args:
        args: Command and arguments as a list (no shell involved).

    Returns:
        The command's exit code, or None if it could not be started.
    """
    try:
        returncode = subprocess.run(args).returncode
    except OSError as err:
        # e.g. git/make/bash is not installed.
        print(f'Setup step could not be started: {args} ({err})')
        return None
    if returncode != 0:
        print(f'Setup step failed with exit code {returncode}: {args}')
    return returncode


# `git clone` refuses to write into an existing non-empty directory, so only
# clone when the checkout is not there yet (keeps re-runs clean).
if not os.path.isdir('./whisper.cpp'):
    _run_setup_step(['git', 'clone', 'https://github.com/ggerganov/whisper.cpp.git'])

_run_setup_step(['make', '-C', './whisper.cpp'])

for model_name in MODELS_TO_DOWNLOAD:
    _run_setup_step(['bash', './whisper.cpp/models/download-ggml-model.sh', model_name])
|
|
|
|
|
|
|
import os |
|
import requests |
|
import json |
|
import base64 |
|
|
|
import gradio as gr |
|
from pathlib import Path |
|
import pysrt |
|
import pandas as pd |
|
import re |
|
import time |
|
|
|
import subprocess |
|
import shlex |
|
|
|
from pytube import YouTube |
|
import torch |
|
|
|
# Markdown/HTML blurb shown at the top of the page.
INTRO_MSG = '''
#### <p>There are many not very widely spoken languages for which it is quite hard to find learning materials,
especially well dubbed videos (target language video with target language subs).
This tool will hopefully transcribe and add subs to your videos.
At least for me this is a nice tool to practice both listening and reading skills.
This is a 'one-click' variant of similar spaces found here on the HF hub.
<p>Speech Recognition is based on models from OpenAI Whisper - https://github.com/openai/whisper
<p> This space is using the c++ implementation by https://github.com/ggerganov/whisper.cpp
'''

# Model names offered in the UI: the downloaded whisper.cpp models, plus any
# custom models (none are defined at present).
whisper_models = MODELS_TO_DOWNLOAD
custom_models = []
combined_models = [*whisper_models, *custom_models]
|
|
|
# Language code -> display name. Insertion order matters: it determines the
# order (and the default, first entry) of the language dropdown built from it.
LANGUAGES = {
    "bg": "Bulgarian", "en": "English", "zh": "Chinese",
    "de": "German", "es": "Spanish", "ru": "Russian",
    "ko": "Korean", "fr": "French", "ja": "Japanese",
    "pt": "Portuguese", "tr": "Turkish", "pl": "Polish",
    "ca": "Catalan", "nl": "Dutch", "ar": "Arabic",
    "sv": "Swedish", "it": "Italian", "id": "Indonesian",
    "hi": "Hindi", "fi": "Finnish", "vi": "Vietnamese",
    "he": "Hebrew", "uk": "Ukrainian", "el": "Greek",
    "ms": "Malay", "cs": "Czech", "ro": "Romanian",
    "da": "Danish", "hu": "Hungarian", "ta": "Tamil",
    "no": "Norwegian", "th": "Thai", "ur": "Urdu",
    "hr": "Croatian", "lt": "Lithuanian", "la": "Latin",
    "mi": "Maori", "ml": "Malayalam", "cy": "Welsh",
    "sk": "Slovak", "te": "Telugu", "fa": "Persian",
    "lv": "Latvian", "bn": "Bengali", "sr": "Serbian",
    "az": "Azerbaijani", "sl": "Slovenian", "kn": "Kannada",
    "et": "Estonian", "mk": "Macedonian", "br": "Breton",
    "eu": "Basque", "is": "Icelandic", "hy": "Armenian",
    "ne": "Nepali", "mn": "Mongolian", "bs": "Bosnian",
    "kk": "Kazakh", "sq": "Albanian", "sw": "Swahili",
    "gl": "Galician", "mr": "Marathi", "pa": "Punjabi",
    "si": "Sinhala", "km": "Khmer", "sn": "Shona",
    "yo": "Yoruba", "so": "Somali", "af": "Afrikaans",
    "oc": "Occitan", "ka": "Georgian", "be": "Belarusian",
    "tg": "Tajik", "sd": "Sindhi", "gu": "Gujarati",
    "am": "Amharic", "yi": "Yiddish", "lo": "Lao",
    "uz": "Uzbek", "fo": "Faroese", "ht": "Haitian creole",
    "ps": "Pashto", "tk": "Turkmen", "nn": "Nynorsk",
    "mt": "Maltese", "sa": "Sanskrit", "lb": "Luxembourgish",
    "my": "Myanmar", "bo": "Tibetan", "tl": "Tagalog",
    "mg": "Malagasy", "as": "Assamese", "tt": "Tatar",
    "haw": "Hawaiian", "ln": "Lingala", "ha": "Hausa",
    "ba": "Bashkir", "jw": "Javanese", "su": "Sundanese",
}
|
|
|
|
|
# Reverse lookup of LANGUAGES: display name -> language code.
source_languages = {name: code for code, name in LANGUAGES.items()}

# Display names in LANGUAGES order; used as the dropdown choices.
source_language_list = list(source_languages)

# Only reported here; nothing else in the visible part of this file uses it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"DEVICE IS: {device}")
|
|
|
def get_youtube(video_url):
    """Download a YouTube video as an MP4 file and return the local path.

    Of the progressive MP4 streams, the one that sorts first by descending
    resolution is downloaded via pytube.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        The file path returned by pytube's ``download()``.

    Raises:
        ValueError: if ``video_url`` is empty, or the video offers no
            progressive MP4 stream.
    """
    if not video_url:
        raise ValueError("Error no video url given")

    yt = YouTube(video_url)
    stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # first() yields None on an empty query; fail with a clear message
    # instead of an AttributeError on `None.download()`.
    if stream is None:
        raise ValueError(f"No progressive mp4 stream available for {video_url}")

    abs_video_path = stream.download()
    print(f"Download complete - {abs_video_path}")
    return abs_video_path
|
|
|
def run_command(command, app_state):
    """Run a shell command and collect its stdout in ``app_state['output']``.

    Every stdout line is printed and appended to ``app_state['output']`` as
    soon as it arrives, and all output is consumed up to end-of-file, so
    nothing is dropped when the process exits. stderr is not captured.

    Args:
        command: Command line handed to the shell unchanged (``shell=True``);
            the caller is responsible for quoting any untrusted parts.
        app_state: Dict with an existing ``'output'`` string to append to.

    Returns:
        The exit code of the process.
    """
    print(command)
    # The context manager closes the pipe and waits for the process on exit.
    with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) as process:
        for raw_line in process.stdout:
            # Never let a stray non-UTF-8 byte abort the whole run.
            decoded = raw_line.decode(errors='replace')
            print(decoded)
            app_state['output'] += decoded
    return process.returncode
|
|
|
def speech_to_text(video_file_path,
                   selected_source_lang,
                   whisper_model,
                   app_state):
    """
    Transcribe a video with whisper.cpp and return the subtitle file paths.

    Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
    This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp

    The audio is first converted with ffmpeg to a 16 kHz, mono, 16-bit PCM
    WAV file next to the video; ``./whisper.cpp/main`` is then run on that
    file with ``-osrt -ovtt``.

    Args:
        video_file_path: Path (str) of the input video file.
        selected_source_lang: Language display name; a key of
            ``source_languages``.
        whisper_model: Model name; selects
            ``./whisper.cpp/models/ggml-<name>.bin``.
        app_state: Dict passed on to ``run_command``, which appends the
            whisper.cpp stdout to ``app_state['output']``.

    Returns:
        ``[vtt_path, srt_path]``: the WAV path with ``.vtt`` / ``.srt``
        appended.

    Raises:
        ValueError: if no video path is given or the language is unknown.
        RuntimeError: if the ffmpeg conversion or the whisper.cpp run fails.
    """
    if video_file_path is None:
        raise ValueError("Error no video input")

    language_code = source_languages.get(selected_source_lang)
    if language_code is None:
        raise ValueError(f"Unsupported source language: {selected_source_lang!r}")

    print(video_file_path)
    # Swap only the real extension; str.replace() would also rewrite earlier
    # occurrences of it in the path (and every gap if there is no extension).
    root, file_ending = os.path.splitext(video_file_path)
    input_wav_file = root + ".wav"
    if input_wav_file == video_file_path:
        # The input already is a .wav: never delete or overwrite it.
        input_wav_file = root + ".16k.wav"
    srt_path = input_wav_file + ".srt"
    vtt_path = input_wav_file + ".vtt"

    print(f'file enging is {file_ending}, starting conversion to wav')
    if os.path.exists(input_wav_file):
        os.remove(input_wav_file)
    # Argument list, no shell: file names with quotes or spaces are safe.
    ffmpeg_args = ['ffmpeg', '-i', video_file_path,
                   '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le',
                   input_wav_file]
    try:
        conversion = subprocess.run(ffmpeg_args)
    except OSError as e:
        raise RuntimeError("Error converting video to wav with ffmpeg") from e
    if conversion.returncode != 0:
        raise RuntimeError(
            f"Error converting video to wav with ffmpeg (exit code {conversion.returncode})")
    print("conversion to wav ready")

    print("starting whisper c++")
    # Drop stale subtitles so an old result is never mistaken for a new one.
    for stale_path in (srt_path, vtt_path):
        try:
            os.remove(stale_path)
        except FileNotFoundError:
            pass

    whisper_args = ['./whisper.cpp/main', input_wav_file,
                    '-t', str(os.cpu_count() or 1),
                    '-l', language_code,
                    '-m', f'./whisper.cpp/models/ggml-{whisper_model}.bin',
                    '-osrt', '-ovtt']
    # run_command takes a shell string, so quote every argument.
    command = ' '.join(shlex.quote(arg) for arg in whisper_args)
    try:
        return_code = run_command(command, app_state)
    except Exception as e:
        raise RuntimeError("Error running Whisper cpp model") from e
    if return_code != 0:
        raise RuntimeError(f"Error running Whisper cpp model (exit code {return_code})")
    print("whisper c++ finished")

    print(f'Subtitles path {srt_path}, {vtt_path}')
    return [vtt_path, srt_path]
|
|
|
def create_video_player(subs_files, video_in):
    """Build an HTML ``<video>`` element with video and subtitles inlined.

    Both files are read completely and embedded as base64 ``data:`` URIs, so
    the returned markup is self-contained.

    Args:
        subs_files: Sequence of subtitle paths; only the first entry (used as
            the WebVTT track) is read.
        video_in: Path of the video file, embedded as ``video/mp4``.

    Returns:
        The HTML markup as a string.
    """
    print(f"create_video_player - {subs_files}, {video_in}")

    with open(subs_files[0], "rb") as file:
        # base64 output is pure ASCII; decode it rather than slicing repr().
        subtitle_base64 = base64.b64encode(file.read()).decode("ascii")

    with open(video_in, "rb") as file:
        video_base64 = base64.b64encode(file.read()).decode("ascii")

    # NOTE: the track is always labelled English/"en", whatever the language.
    video_player = f'''<video id="video" controls preload="metadata">
        <source src="data:video/mp4;base64,{video_base64}" type="video/mp4" />
        <track
            label="English"
            kind="subtitles"
            srclang="en"
            src="data:text/vtt;base64,{subtitle_base64}"
            default />
        </video>
        '''

    print('create_video_player - Done')
    return video_player
|
|
|
|
|
|
|
# UI components are created up front and placed into the page later with
# .render() inside the `with demo:` block.
# video_in / video_out are created here but never rendered in the visible
# part of this file.
video_in = gr.Video(label="Video file", mirror_webcam=False)
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
video_out = gr.Video(label="Video Out", mirror_webcam=False)

# Dropdown of language display names; defaults to the first entry.
selected_source_lang = gr.Dropdown(choices=source_language_list,
                                   type="value",
                                   value= source_language_list[0],
                                   label="Spoken language in video",
                                   interactive=True)
# Dropdown of whisper model names; defaults to the first entry.
selected_whisper_model = gr.Dropdown(choices=whisper_models,
                                     type="value",
                                     value=whisper_models[0],
                                     label="Selected Whisper model",
                                     interactive=True)

# Read-only file list that receives the generated subtitle files.
subtitle_files = gr.File(
    label="Download subtitles",
    file_count="multiple",
    type="file",
    interactive=False,
)

# Placeholder HTML that is replaced by the generated video player markup.
video_player = gr.HTML('<p>video will be played here')
# Hidden slider; rendered, but not wired to any event in the visible code.
eventslider = gr.Slider(visible=False)
# Status line and raw command output shown to the user.
status_msg = gr.Markdown('Status')
output_label = gr.Textbox('', interactive=False, show_label=False)

demo = gr.Blocks()
demo.encrypt = False
|
|
|
def set_app_msg(app_state, msg):
    """Store ``msg`` as the current status message in the app state dict."""
    app_state.update(status_msg=msg)
|
|
|
def transcribe(app_state, youtube_url_in, selected_source_lang, selected_whisper_model):
    """Download a YouTube video, transcribe it and build the subtitle player.

    The status message in ``app_state`` is updated before each stage, and the
    collected command output is reset at the start.

    Args:
        app_state: Dict holding the ``'status_msg'`` and ``'output'`` entries.
        youtube_url_in: URL of the video to download.
        selected_source_lang: Language display name passed to ``speech_to_text``.
        selected_whisper_model: Model name passed to ``speech_to_text``.

    Returns:
        ``(subtitle paths, video player HTML)``.
    """
    app_state['output'] = ''

    set_app_msg(app_state, 'Downloading the movie ...')
    downloaded_path = get_youtube(youtube_url_in)

    running_msg = (
        f'Running the speech to text model {selected_source_lang}/{selected_whisper_model}.'
        ' This can take some time.'
    )
    set_app_msg(app_state, running_msg)
    subs = speech_to_text(downloaded_path, selected_source_lang, selected_whisper_model, app_state)

    set_app_msg(app_state, 'Creating the video player ...')
    player_html = create_video_player(subs, downloaded_path)

    set_app_msg(app_state, 'Transcribing done, generating video player')
    return subs, player_html
|
|
|
|
|
def on_change_event(app_state):
    """Return the current ``(status message, command output)`` from the state.

    The whole state is also printed for debugging.
    """
    print(f'Running! {app_state}')
    current_status = app_state['status_msg']
    collected_output = app_state['output']
    return current_status, collected_output
|
|
|
# Page layout and event wiring.
# NOTE(review): the nesting below was reconstructed from statement order
# only -- confirm that it matches the intended layout.
with demo:
    # Per-session state read by on_change_event and written by transcribe.
    app_state = gr.State({
        'running': False,
        'status_msg': '',
        'output': ''
    })

    # Intro text and example URLs.
    with gr.Row():
        with gr.Column():
            gr.Markdown(INTRO_MSG)
            gr.Markdown('''### Copy any non-private Youtube video URL to box below or click one of the examples.''')
            examples = gr.Examples(examples=["https://www.youtube.com/watch?v=UjAn3Pza3qo", "https://www.youtube.com/watch?v=oOZivhYfPD4"],
                                   label="Examples", inputs=[youtube_url_in])

    # Inputs, the action button and the result widgets.
    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            selected_source_lang.render()
            selected_whisper_model.render()

            # Clicking runs transcribe(); its two return values fill the
            # subtitle file list and the HTML video player.
            download_youtube_btn = gr.Button("Transcribe the video")
            download_youtube_btn.click(transcribe, [app_state, youtube_url_in, selected_source_lang, selected_whisper_model], [subtitle_files, video_player])

            eventslider.render()
            status_msg.render()
            output_label.render()
            subtitle_files.render()
            video_player.render()
    with gr.Row():
        gr.Markdown('This app is based on [this code](https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles/tree/main) by RASMUS.')

    # Refresh the status line and output box from the state on page load;
    # `every=10` presumably repeats this every 10 seconds -- confirm against
    # the installed Gradio version.
    dep = demo.load(on_change_event, inputs=[app_state], outputs=[status_msg, output_label], every=10)
|
|
|
|
|
|
|
# Value of KAGGLE_KERNEL_RUN_TYPE, or None when the variable is not set.
is_kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE')
print(is_kaggle)

# With the Kaggle variable set, launch with share=True and debug=True;
# otherwise launch with the defaults.
launch_options = {'share': True, 'debug': True} if is_kaggle else {}
demo.queue().launch(**launch_options)
|
|