Spaces:

Karwasze
/

Whisper-ASR-youtube-subtitles

Build error

App Files Files Community

Karwasze

RASMUS committed on Nov 18, 2022

Commit

48b9dd3

•

0 Parent(s):

Duplicate from Finnish-NLP/Whisper-ASR-youtube-subtitles

Browse files

Co-authored-by: TOIVANEN <RASMUS@users.noreply.huggingface.co>

Files changed (5) hide show

.gitattributes +31 -0
README.md +14 -0
app.py +271 -0
packages.txt +1 -0
requirements.txt +16 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,31 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Whisper ASR Youtube subtitles creator
+emoji: 👁
+colorFrom: yellow
+colorTo: blue
+sdk: gradio
+sdk_version: 3.9
+app_file: app.py
+pinned: false
+license: apache-2.0
+duplicated_from: Finnish-NLP/Whisper-ASR-youtube-subtitles
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,271 @@

+import gradio as gr
+import os
+from pathlib import Path
+import time
+import pandas as pd
+import re
+import time
+import os
+import whisper
+from pytube import YouTube
+import psutil
+# Set OMP_NUM_THREADS to the CPU count reported by psutil. The variable is
+# written before torch is imported below, so the import order here matters.
+num_cores = psutil.cpu_count()
+os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+import torch
+from easynmt import EasyNMT
+# Both models are loaded once, at import time: the EasyNMT translation model
+# ('m2m_100_418M') and the Whisper "base" speech-recognition model.
+translation_model = EasyNMT('m2m_100_418M', max_new_tokens=60, max_length=60)
+asr_model = whisper.load_model("base")
+# NOTE(review): this module-level dict is not read by any function visible in
+# this file; speech_to_text builds its own decoding options.
+transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
+# Maps the language names offered in the UI to the language codes passed to
+# the translation model as ``target_lang``.
+translation_models = {
+"Finnish": "fi",
+"Swedish": "sv",
+"Danish": "da",
+"English": "en",
+"German": "de"
+}
+# Is CUDA available? The chosen device is only printed here; the visible code
+# never passes it to either model.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("DEVICE IS: ")
+print(device)
+# Ensure the output directory exists.
+# NOTE(review): no function visible in this file writes into it - confirm whether it is still needed.
+videos_out_path = Path("./videos_out")
+videos_out_path.mkdir(parents=True, exist_ok=True)
+def get_youtube(video_url):
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+    print("LADATATTU POLKUUN")
+    print(abs_video_path)
+    return abs_video_path
+async def speech_to_text(video_file_path, selected_translation_lang):
+    """
+    # Youtube with translated subtitles using OpenAI Whisper and Opus-MT models.
+    # Currently supports only English audio
+    This space allows you to:
+    1. Download youtube video with a given url
+    2. Watch it in the first video component
+    3. Run automatic speech recognition on the video using Whisper
+    4. Translate the recognized transcriptions to Finnish, Swedish, Danish, English, German (More languages coming later)
+    5. Burn the translations to the original video and watch the video in the 2nd video component
+    Speech Recognition is based on OpenAI Whisper https://github.com/openai/whisper
+    """
+    if(video_file_path == None):
+        raise ValueError("Error no video input")
+    print(video_file_path)
+    try:
+        audio = whisper.load_audio(video_file_path)
+    except Exception as e:
+        raise RuntimeError("Error converting video to audio")
+    last_time = time.time()
+    try:
+        print(f'Transcribing via local model')
+        transcribe_options = dict(beam_size=5, best_of=5, without_timestamps=False)
+        transcription = asr_model.transcribe(audio, **transcribe_options)
+        #translation_options = dict(language=selected_translation_lang, beam_size=5, best_of=5, without_timestamps=False)
+        #translations = asr_model.transcribe(audio, **translation_options)
+        df = pd.DataFrame(columns=['start','end','text'])
+        for i,segment in enumerate(transcription['segments']):
+            new_row = {'start': segment['start'],
+            'end': segment['end'],
+            'text': segment['text']
+                            }
+            df = df.append(new_row, ignore_index=True)
+        if selected_translation_lang is None:
+                    selected_translation_lang = 'Finnish'
+        sentences = df['text']
+        df['translation'] = translation_model.translate(sentences, target_lang=translation_models.get(selected_translation_lang))
+        print('After translation to target language \n')
+        return (df)
+    except Exception as e:
+        raise RuntimeError("Error Running inference with local model", e)
+def create_srt_and_burn(df, video_in):
+    print("Starting creation of video wit srt")
+    with open('testi.srt','w', encoding="utf-8") as file:
+        for i in range(len(df)):
+            file.write(str(i+1))
+            file.write('\n')
+            start = df.iloc[i]['start']
+            milliseconds = round(start * 1000.0)
+            hours = milliseconds // 3_600_000
+            milliseconds -= hours * 3_600_000
+            minutes = milliseconds // 60_000
+            milliseconds -= minutes * 60_000
+            seconds = milliseconds // 1_000
+            milliseconds -= seconds * 1_000
+            file.write(f"{hours}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}")
+            stop = df.iloc[i]['end']
+            milliseconds = round(stop * 1000.0)
+            hours = milliseconds // 3_600_000
+            milliseconds -= hours * 3_600_000
+            minutes = milliseconds // 60_000
+            milliseconds -= minutes * 60_000
+            seconds = milliseconds // 1_000
+            milliseconds -= seconds * 1_000
+            file.write(' --> ')
+            file.write(f"{hours}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}")
+            file.write('\n')
+            file.writelines(df.iloc[i]['translation'])
+            if int(i) != len(df)-1:
+                file.write('\n\n')
+    print("SRT DONE")
+    try:
+        file1 = open('./testi.srt', 'r', encoding="utf-8")
+        Lines = file1.readlines()
+        count = 0
+        # Strips the newline character
+        for line in Lines:
+            count += 1
+            print("{}".format(line))
+        print(type(video_in))
+        print(video_in)
+        video_out = video_in.replace('.mp4', '_out.mp4')
+        print(video_out)
+        command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
+        print(command)
+        os.system(command)
+        return video_out
+    except Exception as e:
+        print(e)
+        return video_out
+# ---- Gradio Layout -----
+# The components are created here, outside the Blocks context, and are placed
+# into the page later by calling .render() inside ``with demo:``.
+video_in = gr.Video(label="Video file", mirror_webcam=False)
+youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+video_out = gr.Video(label="Video Out", mirror_webcam=False)
+# Empty frame with the same columns that speech_to_text returns; it is the
+# initial value of the transcription table.
+df_init = pd.DataFrame(columns=['start','end','text','translation'])
+selected_translation_lang = gr.Dropdown(choices=["English", "German","Finnish","Swedish", "Danish"], type="value", value="English", label="Language to translate transcriptions to", interactive=True)
+transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10)
+demo = gr.Blocks(css='''
+#cut_btn, #reset_btn { align-self:stretch; }
+#\\31 3 { max-width: 540px; }
+.output-markdown {max-width: 65ch !important;}
+''')
+# NOTE(review): the effect of this attribute is not visible in this file - confirm it is still honoured by the pinned gradio version.
+demo.encrypt = False
+with demo:
+    # NOTE(review): transcription_var is never referenced again in the visible code.
+    transcription_var = gr.Variable()
+    # Row 1: usage instructions (left) and example URLs (right).
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ### This space allows you to:
+            ##### 1. Download youtube video with a given URL
+            ##### 2. Watch it in the first video component
+            ##### 3. Run automatic speech recognition on the video using Whisper (Please remember to select translation language)
+            ##### 4. Translate the recognized transcriptions to English, Finnish, Swedish, Danish and German
+            ##### 5. Burn the translations to the original video and watch the video in the 2nd video component
+            ''')
+        with gr.Column():
+            gr.Markdown('''
+            ### 1. Insert Youtube URL below (Some examples below which I suggest to use for first tests)
+            ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
+            ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
+            ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
+            ''')
+    # Step 1: URL textbox and download button; get_youtube's return value is
+    # sent to the input video component.
+    with gr.Row():
+        with gr.Column():
+            youtube_url_in.render()
+            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
+            download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                video_in])
+            # Debug output: runs once while the layout is built and prints the
+            # component object, not a video.
+            print(video_in)
+    # Step 2: input video plus the transcribe/translate button; the result of
+    # speech_to_text is sent to the transcription table.
+    with gr.Row():
+        with gr.Column():
+            video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription and translation process.
+                ##### Be aware that processing will last for a while (35 second video took around 20 seconds in my testing)
+                ''')
+            transcribe_btn = gr.Button("Step 2. Transcribe and translate audio")
+            transcribe_btn.click(speech_to_text, [video_in, selected_translation_lang], transcription_df)
+    # The language dropdown is rendered after the transcribe button that reads it.
+    with gr.Row():
+        with gr.Column():
+            selected_translation_lang.render()
+    with gr.Row():
+        gr.Markdown('''
+        ##### Here you will get transcription and translation output
+        ##### If you see error please remember to select translation language
+        ##### ''')
+    with gr.Row():
+        with gr.Column():
+            transcription_df.render()
+    # Step 3: write the SRT file and burn it into the video; the returned path
+    # is sent to the output video component.
+    with gr.Row():
+        with gr.Column():
+            translate_and_make_srt_btn = gr.Button("Step 3. Create and burn srt to video")
+            # Debug output, same as above: prints the component object at build time.
+            print(video_in)
+            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_df,video_in], [
+                video_out])
+            video_out.render()
+# Start the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch(debug=True)

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


+ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+fastapi==0.85.0
+ffmpeg-python==0.2.0
+gradio==3.9
+matplotlib==3.6.1
+pandas==1.5.0
+pytube==12.1.0
+sacremoses==0.0.53
+sentencepiece==0.1.97
+tokenizers==0.12.1
+torch==1.12.1
+torchaudio==0.12.1
+tqdm==4.64.1
+EasyNMT==2.0.2
+transformers==4.22.2
+whisper @ git+https://github.com/openai/whisper.git
+psutil==5.9.2