IRISLAB committed
Commit 84ffa23
1 Parent(s): 17d8211

Upload 28 files

.gitignore ADDED
@@ -0,0 +1,6 @@
+ venv/
+ ui/__pycache__/
+ outputs/
+ modules/__pycache__/
+ models/
+ modules/yt_tmp.wav
app.py ADDED
@@ -0,0 +1,358 @@
1
+ import gradio as gr
2
+ import os
3
+ import argparse
4
+
5
+ from modules.whisper_Inference import WhisperInference
6
+ from modules.faster_whisper_inference import FasterWhisperInference
7
+ from modules.nllb_inference import NLLBInference
8
+ from ui.htmls import *
9
+ from modules.youtube_manager import get_ytmetas
10
+ from modules.deepl_api import DeepLAPI
11
+ from modules.whisper_parameter import *
12
+
13
+
14
+ class App:
15
+ def __init__(self, args):
16
+ self.args = args
17
+ self.app = gr.Blocks(css=CSS, theme=self.args.theme)
18
+ self.whisper_inf = self.init_whisper()
19
+ print(f"Use \"{self.args.whisper_type}\" implementation")
20
+ print(f"Device \"{self.whisper_inf.device}\" is detected")
21
+ self.nllb_inf = NLLBInference()
22
+ self.deepl_api = DeepLAPI()
23
+
24
+ def init_whisper(self):
25
+ whisper_type = self.args.whisper_type.lower().strip()
26
+
27
+ if whisper_type in ["faster_whisper", "faster-whisper"]:
28
+ whisper_inf = FasterWhisperInference()
29
+ whisper_inf.model_dir = self.args.faster_whisper_model_dir
30
+ elif whisper_type in ["whisper"]:
31
+ whisper_inf = WhisperInference()
32
+ whisper_inf.model_dir = self.args.whisper_model_dir
33
+ else:
34
+ whisper_inf = FasterWhisperInference()
35
+ whisper_inf.model_dir = self.args.faster_whisper_model_dir
36
+ return whisper_inf
37
+
38
+ @staticmethod
39
+ def open_folder(folder_path: str):
40
+ if os.path.exists(folder_path):
41
+ os.system(f"start {folder_path}")
42
+ else:
43
+ print(f"The folder {folder_path} does not exist.")
44
+
45
+ @staticmethod
46
+ def on_change_models(model_size: str):
47
+ translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
48
+ if model_size not in translatable_model:
49
+ return gr.Checkbox(visible=False, value=False, interactive=False)
50
+ else:
51
+ return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
52
+
53
+ def launch(self):
54
+ with self.app:
55
+ with gr.Row():
56
+ with gr.Column():
57
+ gr.Markdown(MARKDOWN, elem_id="md_project")
58
+ with gr.Tabs():
59
+ with gr.TabItem("File"): # tab1
60
+ with gr.Row():
61
+ input_file = gr.Files(type="filepath", label="Upload File here")
62
+ with gr.Row():
63
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
64
+ label="Model")
65
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
66
+ value="Automatic Detection", label="Language")
67
+ dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
68
+ with gr.Row():
69
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
70
+ with gr.Row():
71
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
72
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
73
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
74
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
75
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
76
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
77
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
78
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
79
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
80
+ with gr.Accordion("Advanced_Parameters", open=False):
81
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
82
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
83
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
84
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
85
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
86
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True)
87
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
88
+ tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
89
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
90
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
91
+ with gr.Row():
92
+ btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
93
+ with gr.Row():
94
+ tb_indicator = gr.Textbox(label="Output", scale=5)
95
+ files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)
96
+ btn_openfolder = gr.Button('📂', scale=1)
97
+
98
+ params = [input_file, dd_file_format, cb_timestamp]
99
+ whisper_params = WhisperGradioComponents(model_size=dd_model,
100
+ lang=dd_lang,
101
+ is_translate=cb_translate,
102
+ beam_size=nb_beam_size,
103
+ log_prob_threshold=nb_log_prob_threshold,
104
+ no_speech_threshold=nb_no_speech_threshold,
105
+ compute_type=dd_compute_type,
106
+ best_of=nb_best_of,
107
+ patience=nb_patience,
108
+ condition_on_previous_text=cb_condition_on_previous_text,
109
+ initial_prompt=tb_initial_prompt,
110
+ temperature=sd_temperature,
111
+ compression_ratio_threshold=nb_compression_ratio_threshold,
112
+ vad_filter=cb_vad_filter,
113
+ threshold=sd_threshold,
114
+ min_speech_duration_ms=nb_min_speech_duration_ms,
115
+ max_speech_duration_s=nb_max_speech_duration_s,
116
+ min_silence_duration_ms=nb_min_silence_duration_ms,
117
+ window_size_sample=nb_window_size_sample,
118
+ speech_pad_ms=nb_speech_pad_ms)
119
+
120
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
121
+ inputs=params + whisper_params.to_list(),
122
+ outputs=[tb_indicator, files_subtitles])
123
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
124
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
125
+
126
+ with gr.TabItem("Youtube"): # tab2
127
+ with gr.Row():
128
+ tb_youtubelink = gr.Textbox(label="Youtube Link")
129
+ with gr.Row(equal_height=True):
130
+ with gr.Column():
131
+ img_thumbnail = gr.Image(label="Youtube Thumbnail")
132
+ with gr.Column():
133
+ tb_title = gr.Label(label="Youtube Title")
134
+ tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
135
+ with gr.Row():
136
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
137
+ label="Model")
138
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
139
+ value="Automatic Detection", label="Language")
140
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
141
+ with gr.Row():
142
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
143
+ with gr.Row():
144
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
145
+ interactive=True)
146
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
147
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
148
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
149
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
150
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
151
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
152
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
153
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
154
+ with gr.Accordion("Advanced_Parameters", open=False):
155
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
156
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
157
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
158
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
159
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
160
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True)
161
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
162
+ tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
163
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
164
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
165
+ with gr.Row():
166
+ btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
167
+ with gr.Row():
168
+ tb_indicator = gr.Textbox(label="Output", scale=5)
169
+ files_subtitles = gr.Files(label="Downloadable output file", scale=3)
170
+ btn_openfolder = gr.Button('📂', scale=1)
171
+
172
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
173
+ whisper_params = WhisperGradioComponents(model_size=dd_model,
174
+ lang=dd_lang,
175
+ is_translate=cb_translate,
176
+ beam_size=nb_beam_size,
177
+ log_prob_threshold=nb_log_prob_threshold,
178
+ no_speech_threshold=nb_no_speech_threshold,
179
+ compute_type=dd_compute_type,
180
+ best_of=nb_best_of,
181
+ patience=nb_patience,
182
+ condition_on_previous_text=cb_condition_on_previous_text,
183
+ initial_prompt=tb_initial_prompt,
184
+ temperature=sd_temperature,
185
+ compression_ratio_threshold=nb_compression_ratio_threshold,
186
+ vad_filter=cb_vad_filter,
187
+ threshold=sd_threshold,
188
+ min_speech_duration_ms=nb_min_speech_duration_ms,
189
+ max_speech_duration_s=nb_max_speech_duration_s,
190
+ min_silence_duration_ms=nb_min_silence_duration_ms,
191
+ window_size_sample=nb_window_size_sample,
192
+ speech_pad_ms=nb_speech_pad_ms)
193
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
194
+ inputs=params + whisper_params.to_list(),
195
+ outputs=[tb_indicator, files_subtitles])
196
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
197
+ outputs=[img_thumbnail, tb_title, tb_description])
198
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
199
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
200
+
201
+ with gr.TabItem("Mic"): # tab3
202
+ with gr.Row():
203
+ mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
204
+ with gr.Row():
205
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
206
+ label="Model")
207
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
208
+ value="Automatic Detection", label="Language")
209
+ dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
210
+ with gr.Row():
211
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
212
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
213
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
214
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
215
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
216
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
217
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
218
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
219
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
220
+ with gr.Accordion("Advanced_Parameters", open=False):
221
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
222
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
223
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
224
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
225
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
226
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True)
227
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
228
+ tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
229
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
230
+ with gr.Row():
231
+ btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
232
+ with gr.Row():
233
+ tb_indicator = gr.Textbox(label="Output", scale=5)
234
+ files_subtitles = gr.Files(label="Downloadable output file", scale=3)
235
+ btn_openfolder = gr.Button('📂', scale=1)
236
+
237
+ params = [mic_input, dd_file_format]
238
+ whisper_params = WhisperGradioComponents(model_size=dd_model,
239
+ lang=dd_lang,
240
+ is_translate=cb_translate,
241
+ beam_size=nb_beam_size,
242
+ log_prob_threshold=nb_log_prob_threshold,
243
+ no_speech_threshold=nb_no_speech_threshold,
244
+ compute_type=dd_compute_type,
245
+ best_of=nb_best_of,
246
+ patience=nb_patience,
247
+ condition_on_previous_text=cb_condition_on_previous_text,
248
+ initial_prompt=tb_initial_prompt,
249
+ temperature=sd_temperature,
250
+ compression_ratio_threshold=nb_compression_ratio_threshold,
251
+ vad_filter=cb_vad_filter,
252
+ threshold=sd_threshold,
253
+ min_speech_duration_ms=nb_min_speech_duration_ms,
254
+ max_speech_duration_s=nb_max_speech_duration_s,
255
+ min_silence_duration_ms=nb_min_silence_duration_ms,
256
+ window_size_sample=nb_window_size_sample,
257
+ speech_pad_ms=nb_speech_pad_ms)
258
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
259
+ inputs=params + whisper_params.to_list(),
260
+ outputs=[tb_indicator, files_subtitles])
261
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
262
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
263
+
264
+ with gr.TabItem("T2T Translation"): # tab 4
265
+ with gr.Row():
266
+ file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here",
267
+ file_types=['.vtt', '.srt'])
268
+
269
+ with gr.TabItem("DeepL API"): # sub tab1
270
+ with gr.Row():
271
+ tb_authkey = gr.Textbox(label="Your Auth Key (API KEY)",
272
+ value="")
273
+ with gr.Row():
274
+ dd_deepl_sourcelang = gr.Dropdown(label="Source Language", value="Automatic Detection",
275
+ choices=list(
276
+ self.deepl_api.available_source_langs.keys()))
277
+ dd_deepl_targetlang = gr.Dropdown(label="Target Language", value="English",
278
+ choices=list(
279
+ self.deepl_api.available_target_langs.keys()))
280
+ with gr.Row():
281
+ cb_deepl_ispro = gr.Checkbox(label="Pro User?", value=False)
282
+ with gr.Row():
283
+ btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
284
+ with gr.Row():
285
+ tb_indicator = gr.Textbox(label="Output", scale=5)
286
+ files_subtitles = gr.Files(label="Downloadable output file", scale=3)
287
+ btn_openfolder = gr.Button('📂', scale=1)
288
+
289
+ btn_run.click(fn=self.deepl_api.translate_deepl,
290
+ inputs=[tb_authkey, file_subs, dd_deepl_sourcelang, dd_deepl_targetlang,
291
+ cb_deepl_ispro],
292
+ outputs=[tb_indicator, files_subtitles])
293
+
294
+ btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
295
+ inputs=None,
296
+ outputs=None)
297
+
298
+ with gr.TabItem("NLLB"): # sub tab2
299
+ with gr.Row():
300
+ dd_nllb_model = gr.Dropdown(label="Model", value="facebook/nllb-200-1.3B",
301
+ choices=self.nllb_inf.available_models)
302
+ dd_nllb_sourcelang = gr.Dropdown(label="Source Language",
303
+ choices=self.nllb_inf.available_source_langs)
304
+ dd_nllb_targetlang = gr.Dropdown(label="Target Language",
305
+ choices=self.nllb_inf.available_target_langs)
306
+ with gr.Row():
307
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
308
+ interactive=True)
309
+ with gr.Row():
310
+ btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
311
+ with gr.Row():
312
+ tb_indicator = gr.Textbox(label="Output", scale=5)
313
+ files_subtitles = gr.Files(label="Downloadable output file", scale=3)
314
+ btn_openfolder = gr.Button('📂', scale=1)
315
+ with gr.Column():
316
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
317
+
318
+ btn_run.click(fn=self.nllb_inf.translate_file,
319
+ inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang, cb_timestamp],
320
+ outputs=[tb_indicator, files_subtitles])
321
+
322
+ btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
323
+ inputs=None,
324
+ outputs=None)
325
+
326
+ # Launch the app with optional gradio settings
327
+ launch_args = {}
328
+ if self.args.share:
329
+ launch_args['share'] = self.args.share
330
+ if self.args.server_name:
331
+ launch_args['server_name'] = self.args.server_name
332
+ if self.args.server_port:
333
+ launch_args['server_port'] = self.args.server_port
334
+ if self.args.username and self.args.password:
335
+ launch_args['auth'] = (self.args.username, self.args.password)
336
+ launch_args['inbrowser'] = True
337
+
338
+ self.app.queue(api_open=False).launch(**launch_args)
339
+
340
+
341
+ # Create the parser for command-line arguments
342
+ parser = argparse.ArgumentParser()
343
+ parser.add_argument('--whisper_type', type=str, default="faster-whisper", help='A type of the whisper implementation between: ["whisper", "faster-whisper"]')
344
+ parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
345
+ parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
346
+ parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
347
+ parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
348
+ parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
349
+ parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
350
+ parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
351
+ parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
352
+ parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
353
+ parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
354
+ _args = parser.parse_args()
355
+
356
+ if __name__ == "__main__":
357
+ app = App(args=_args)
358
+ app.launch()
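The argparse flags defined at the bottom of app.py feed the launch_args dict built in App.launch(). A minimal sketch of starting the WebUI with them; the host, port, and flag values below are illustrative assumptions, not part of the commit:

# Illustrative only: launch the WebUI from a shell (values are examples).
#   python app.py --whisper_type faster-whisper --server_name 0.0.0.0 --server_port 7860 --share
#
# The programmatic equivalent mirrors the __main__ block above:
from app import App, _args   # _args is the module-level parse_args() result

webui = App(args=_args)      # builds the Gradio Blocks layout and the inference backends
webui.launch()               # queues the app and calls gradio's launch(**launch_args)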
models/models will be saved here.txt ADDED
File without changes
modules/__init__.py ADDED
File without changes
modules/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (152 Bytes).
modules/__pycache__/deepl_api.cpython-312.pyc ADDED
Binary file (7.51 kB).
modules/__pycache__/faster_whisper_inference.cpython-312.pyc ADDED
Binary file (7.5 kB).
modules/__pycache__/nllb_inference.cpython-312.pyc ADDED
Binary file (11.9 kB).
modules/__pycache__/subtitle_manager.cpython-312.pyc ADDED
Binary file (6.03 kB).
modules/__pycache__/translation_base.cpython-312.pyc ADDED
Binary file (7.46 kB).
modules/__pycache__/whisper_Inference.cpython-312.pyc ADDED
Binary file (4.76 kB).
modules/__pycache__/whisper_base.cpython-312.pyc ADDED
Binary file (14.5 kB).
modules/__pycache__/whisper_parameter.cpython-312.pyc ADDED
Binary file (2.96 kB).
modules/__pycache__/youtube_manager.cpython-312.pyc ADDED
Binary file (1.03 kB).
modules/deepl_api.py ADDED
@@ -0,0 +1,196 @@
1
+ import requests
2
+ import time
3
+ import os
4
+ from datetime import datetime
5
+ import gradio as gr
6
+
7
+ from modules.subtitle_manager import *
8
+
9
+ """
10
+ This is written with reference to the DeepL API documentation.
11
+ If you want to know the information of the DeepL API, see here: https://www.deepl.com/docs-api/documents
12
+ """
13
+
14
+ DEEPL_AVAILABLE_TARGET_LANGS = {
15
+ 'Bulgarian': 'BG',
16
+ 'Czech': 'CS',
17
+ 'Danish': 'DA',
18
+ 'German': 'DE',
19
+ 'Greek': 'EL',
20
+ 'English': 'EN',
21
+ 'English (British)': 'EN-GB',
22
+ 'English (American)': 'EN-US',
23
+ 'Spanish': 'ES',
24
+ 'Estonian': 'ET',
25
+ 'Finnish': 'FI',
26
+ 'French': 'FR',
27
+ 'Hungarian': 'HU',
28
+ 'Indonesian': 'ID',
29
+ 'Italian': 'IT',
30
+ 'Japanese': 'JA',
31
+ 'Korean': 'KO',
32
+ 'Lithuanian': 'LT',
33
+ 'Latvian': 'LV',
34
+ 'Norwegian (Bokmål)': 'NB',
35
+ 'Dutch': 'NL',
36
+ 'Polish': 'PL',
37
+ 'Portuguese': 'PT',
38
+ 'Portuguese (Brazilian)': 'PT-BR',
39
+ 'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
40
+ 'Romanian': 'RO',
41
+ 'Russian': 'RU',
42
+ 'Slovak': 'SK',
43
+ 'Slovenian': 'SL',
44
+ 'Swedish': 'SV',
45
+ 'Turkish': 'TR',
46
+ 'Ukrainian': 'UK',
47
+ 'Chinese (simplified)': 'ZH'
48
+ }
49
+
50
+ DEEPL_AVAILABLE_SOURCE_LANGS = {
51
+ 'Automatic Detection': None,
52
+ 'Bulgarian': 'BG',
53
+ 'Czech': 'CS',
54
+ 'Danish': 'DA',
55
+ 'German': 'DE',
56
+ 'Greek': 'EL',
57
+ 'English': 'EN',
58
+ 'Spanish': 'ES',
59
+ 'Estonian': 'ET',
60
+ 'Finnish': 'FI',
61
+ 'French': 'FR',
62
+ 'Hungarian': 'HU',
63
+ 'Indonesian': 'ID',
64
+ 'Italian': 'IT',
65
+ 'Japanese': 'JA',
66
+ 'Korean': 'KO',
67
+ 'Lithuanian': 'LT',
68
+ 'Latvian': 'LV',
69
+ 'Norwegian (Bokmål)': 'NB',
70
+ 'Dutch': 'NL',
71
+ 'Polish': 'PL',
72
+ 'Portuguese (all Portuguese varieties mixed)': 'PT',
73
+ 'Romanian': 'RO',
74
+ 'Russian': 'RU',
75
+ 'Slovak': 'SK',
76
+ 'Slovenian': 'SL',
77
+ 'Swedish': 'SV',
78
+ 'Turkish': 'TR',
79
+ 'Ukrainian': 'UK',
80
+ 'Chinese': 'ZH'
81
+ }
82
+
83
+
84
+ class DeepLAPI:
85
+ def __init__(self):
86
+ self.api_interval = 1
87
+ self.max_text_batch_size = 50
88
+ self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
89
+ self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
90
+
91
+ def translate_deepl(self,
92
+ auth_key: str,
93
+ fileobjs: list,
94
+ source_lang: str,
95
+ target_lang: str,
96
+ is_pro: bool,
97
+ progress=gr.Progress()) -> list:
98
+ """
99
+ Translate subtitle files using DeepL API
100
+ Parameters
101
+ ----------
102
+ auth_key: str
103
+ API Key for DeepL from gr.Textbox()
104
+ fileobjs: list
105
+ List of subtitle files to translate from gr.Files()
106
+ source_lang: str
107
+ Source language of the file to translate from gr.Dropdown()
108
+ target_lang: str
109
+ Target language of the file to translate from gr.Dropdown()
110
+ is_pro: bool
111
+ Boolean value indicating whether the user has a DeepL Pro subscription, from gr.Checkbox().
112
+ progress: gr.Progress
113
+ Indicator to show progress directly in gradio.
114
+ Returns
115
+ ----------
116
+ A List of
117
+ String to return to gr.Textbox()
118
+ Files to return to gr.Files()
119
+ """
120
+
121
+ files_info = {}
122
+ for fileobj in fileobjs:
123
+ file_path = fileobj.name
124
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
125
+
126
+ if file_ext == ".srt":
127
+ parsed_dicts = parse_srt(file_path=file_path)
128
+
129
+ batch_size = self.max_text_batch_size
130
+ for batch_start in range(0, len(parsed_dicts), batch_size):
131
+ batch_end = min(batch_start + batch_size, len(parsed_dicts))
132
+ sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
133
+ translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
134
+ target_lang, is_pro)
135
+ for i, translated_text in enumerate(translated_texts):
136
+ parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
137
+ progress(batch_end / len(parsed_dicts), desc="Translating..")
138
+
139
+ subtitle = get_serialized_srt(parsed_dicts)
140
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
141
+
142
+ file_name = file_name[:-9]
143
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}.srt")
144
+ write_file(subtitle, output_path)
145
+
146
+ elif file_ext == ".vtt":
147
+ parsed_dicts = parse_vtt(file_path=file_path)
148
+
149
+ batch_size = self.max_text_batch_size
150
+ for batch_start in range(0, len(parsed_dicts), batch_size):
151
+ batch_end = min(batch_start + batch_size, len(parsed_dicts))
152
+ sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
153
+ translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
154
+ target_lang, is_pro)
155
+ for i, translated_text in enumerate(translated_texts):
156
+ parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
157
+ progress(batch_end / len(parsed_dicts), desc="Translating..")
158
+
159
+ subtitle = get_serialized_vtt(parsed_dicts)
160
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
161
+
162
+ file_name = file_name[:-9]
163
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}.vtt")
164
+
165
+ write_file(subtitle, output_path)
166
+
167
+ files_info[file_name] = subtitle
168
+ total_result = ''
169
+ for file_name, subtitle in files_info.items():
170
+ total_result += '------------------------------------\n'
171
+ total_result += f'{file_name}\n\n'
172
+ total_result += f'{subtitle}'
173
+
174
+ gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
175
+ return [gr_str, output_path]
176
+
177
+ def request_deepl_translate(self,
178
+ auth_key: str,
179
+ text: list,
180
+ source_lang: str,
181
+ target_lang: str,
182
+ is_pro: bool):
183
+ """Request API response to DeepL server"""
184
+
185
+ url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
186
+ headers = {
187
+ 'Authorization': f'DeepL-Auth-Key {auth_key}'
188
+ }
189
+ data = {
190
+ 'text': text,
191
+ 'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
192
+ 'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
193
+ }
194
+ response = requests.post(url, headers=headers, data=data).json()
195
+ time.sleep(self.api_interval)
196
+ return response["translations"]
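Outside the Gradio UI, the same request path can be exercised directly through request_deepl_translate(); a minimal sketch, assuming a valid key (the key string and language names below are placeholders):

# Minimal sketch; "YOUR_DEEPL_KEY" is a placeholder, not a real credential.
from modules.deepl_api import DeepLAPI

deepl = DeepLAPI()
translations = deepl.request_deepl_translate(
    auth_key="YOUR_DEEPL_KEY",
    text=["Hello.", "See you tomorrow."],   # up to max_text_batch_size lines per request
    source_lang="English",
    target_lang="Korean",
    is_pro=False,                           # False -> api-free.deepl.com endpoint
)
print([t["text"] for t in translations])    # DeepL returns a list of {"text": ...} dicts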
modules/faster_whisper_inference.py ADDED
@@ -0,0 +1,154 @@
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ from typing import BinaryIO, Union, Tuple, List
5
+
6
+ import faster_whisper
7
+ from faster_whisper.vad import VadOptions
8
+ import ctranslate2
9
+ import whisper
10
+ import gradio as gr
11
+
12
+ from modules.whisper_parameter import *
13
+ from modules.whisper_base import WhisperBase
14
+
15
+ # Temporal fix of the issue : https://github.com/jhj0517/Whisper-WebUI/issues/144
16
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
17
+
18
+
19
+ class FasterWhisperInference(WhisperBase):
20
+ def __init__(self):
21
+ super().__init__(
22
+ model_dir=os.path.join("models", "Whisper", "faster-whisper")
23
+ )
24
+ self.model_paths = self.get_model_paths()
25
+ self.available_models = self.model_paths.keys()
26
+ self.available_compute_types = ctranslate2.get_supported_compute_types(
27
+ "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
28
+
29
+ def transcribe(self,
30
+ audio: Union[str, BinaryIO, np.ndarray],
31
+ progress: gr.Progress,
32
+ *whisper_params,
33
+ ) -> Tuple[List[dict], float]:
34
+ """
35
+ transcribe method for faster-whisper.
36
+
37
+ Parameters
38
+ ----------
39
+ audio: Union[str, BinaryIO, np.ndarray]
40
+ Audio path or file binary or Audio numpy array
41
+ progress: gr.Progress
42
+ Indicator to show progress directly in gradio.
43
+ *whisper_params: tuple
44
+ Gradio components related to Whisper. see whisper_data_class.py for details.
45
+
46
+ Returns
47
+ ----------
48
+ segments_result: List[dict]
49
+ list of dicts that includes start, end timestamps and transcribed text
50
+ elapsed_time: float
51
+ elapsed time for transcription
52
+ """
53
+ start_time = time.time()
54
+
55
+ params = WhisperValues(*whisper_params)
56
+
57
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
58
+ self.update_model(params.model_size, params.compute_type, progress)
59
+
60
+ if params.lang == "Automatic Detection":
61
+ params.lang = None
62
+ else:
63
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
64
+ params.lang = language_code_dict[params.lang]
65
+
66
+ vad_options = VadOptions(
67
+ threshold=params.threshold,
68
+ min_speech_duration_ms=params.min_speech_duration_ms,
69
+ max_speech_duration_s=params.max_speech_duration_s,
70
+ min_silence_duration_ms=params.min_silence_duration_ms,
71
+ window_size_samples=params.window_size_samples,
72
+ speech_pad_ms=params.speech_pad_ms
73
+ )
74
+
75
+ segments, info = self.model.transcribe(
76
+ audio=audio,
77
+ language=params.lang,
78
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
79
+ beam_size=params.beam_size,
80
+ log_prob_threshold=params.log_prob_threshold,
81
+ no_speech_threshold=params.no_speech_threshold,
82
+ best_of=params.best_of,
83
+ patience=params.patience,
84
+ temperature=params.temperature,
85
+ compression_ratio_threshold=params.compression_ratio_threshold,
86
+ vad_filter=params.vad_filter,
87
+ vad_parameters=vad_options
88
+ )
89
+ progress(0, desc="Loading audio..")
90
+
91
+ segments_result = []
92
+ for segment in segments:
93
+ progress(segment.start / info.duration, desc="Transcribing..")
94
+ segments_result.append({
95
+ "start": segment.start,
96
+ "end": segment.end,
97
+ "text": segment.text
98
+ })
99
+
100
+ elapsed_time = time.time() - start_time
101
+ return segments_result, elapsed_time
102
+
103
+ def update_model(self,
104
+ model_size: str,
105
+ compute_type: str,
106
+ progress: gr.Progress
107
+ ):
108
+ """
109
+ Update current model setting
110
+
111
+ Parameters
112
+ ----------
113
+ model_size: str
114
+ Size of whisper model
115
+ compute_type: str
116
+ Compute type for transcription.
117
+ see more info : https://opennmt.net/CTranslate2/quantization.html
118
+ progress: gr.Progress
119
+ Indicator to show progress directly in gradio.
120
+ """
121
+ progress(0, desc="Initializing Model..")
122
+ self.current_model_size = self.model_paths[model_size]
123
+ self.current_compute_type = compute_type
124
+ self.model = faster_whisper.WhisperModel(
125
+ device=self.device,
126
+ model_size_or_path=self.current_model_size,
127
+ download_root=self.model_dir,
128
+ compute_type=self.current_compute_type
129
+ )
130
+
131
+ def get_model_paths(self):
132
+ """
133
+ Get available models from models path including fine-tuned model.
134
+
135
+ Returns
136
+ ----------
137
+ Name list of models
138
+ """
139
+ model_paths = {model:model for model in whisper.available_models()}
140
+ faster_whisper_prefix = "models--Systran--faster-whisper-"
141
+
142
+ existing_models = os.listdir(self.model_dir)
143
+ wrong_dirs = [".locks"]
144
+ existing_models = list(set(existing_models) - set(wrong_dirs))
145
+
146
+ webui_dir = os.getcwd()
147
+
148
+ for model_name in existing_models:
149
+ if faster_whisper_prefix in model_name:
150
+ model_name = model_name[len(faster_whisper_prefix):]
151
+
152
+ if model_name not in whisper.available_models():
153
+ model_paths[model_name] = os.path.join(webui_dir, self.model_dir, model_name)
154
+ return model_paths
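FasterWhisperInference.transcribe() above is essentially a thin wrapper around faster_whisper.WhisperModel.transcribe() plus the Silero VAD options; a stand-alone sketch of the same call, with an illustrative model size, audio path, and thresholds:

# Stand-alone sketch of the call wrapped by FasterWhisperInference.transcribe().
import faster_whisper
from faster_whisper.vad import VadOptions

model = faster_whisper.WhisperModel("large-v2", device="cuda", compute_type="float16")
segments, info = model.transcribe(
    "sample.wav",                      # illustrative audio path
    beam_size=1,
    vad_filter=True,
    vad_parameters=VadOptions(threshold=0.5, min_silence_duration_ms=2000),
)
for segment in segments:               # segments is a generator; iterating runs the decode
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")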
modules/nllb_inference.py ADDED
@@ -0,0 +1,253 @@
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
+ import gradio as gr
3
+ import os
4
+
5
+ from modules.translation_base import TranslationBase
6
+
7
+
8
+ class NLLBInference(TranslationBase):
9
+ def __init__(self):
10
+ super().__init__(
11
+ model_dir=os.path.join("models", "NLLB")
12
+ )
13
+ self.tokenizer = None
14
+ self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
15
+ self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
16
+ self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
17
+ self.pipeline = None
18
+
19
+ def translate(self,
20
+ text: str
21
+ ):
22
+ result = self.pipeline(text)
23
+ return result[0]['translation_text']
24
+
25
+ def update_model(self,
26
+ model_size: str,
27
+ src_lang: str,
28
+ tgt_lang: str,
29
+ progress: gr.Progress
30
+ ):
31
+ if model_size != self.current_model_size or self.model is None:
32
+ print("\nInitializing NLLB Model..\n")
33
+ progress(0, desc="Initializing NLLB Model..")
34
+ self.current_model_size = model_size
35
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
36
+ cache_dir=self.model_dir)
37
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
38
+ cache_dir=os.path.join(self.model_dir, "tokenizers"))
39
+ src_lang = NLLB_AVAILABLE_LANGS[src_lang]
40
+ tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
41
+ self.pipeline = pipeline("translation",
42
+ model=self.model,
43
+ tokenizer=self.tokenizer,
44
+ src_lang=src_lang,
45
+ tgt_lang=tgt_lang,
46
+ device=self.device)
47
+
48
+ NLLB_AVAILABLE_LANGS = {
49
+ "Acehnese (Arabic script)": "ace_Arab",
50
+ "Acehnese (Latin script)": "ace_Latn",
51
+ "Mesopotamian Arabic": "acm_Arab",
52
+ "Ta’izzi-Adeni Arabic": "acq_Arab",
53
+ "Tunisian Arabic": "aeb_Arab",
54
+ "Afrikaans": "afr_Latn",
55
+ "South Levantine Arabic": "ajp_Arab",
56
+ "Akan": "aka_Latn",
57
+ "Amharic": "amh_Ethi",
58
+ "North Levantine Arabic": "apc_Arab",
59
+ "Modern Standard Arabic": "arb_Arab",
60
+ "Modern Standard Arabic (Romanized)": "arb_Latn",
61
+ "Najdi Arabic": "ars_Arab",
62
+ "Moroccan Arabic": "ary_Arab",
63
+ "Egyptian Arabic": "arz_Arab",
64
+ "Assamese": "asm_Beng",
65
+ "Asturian": "ast_Latn",
66
+ "Awadhi": "awa_Deva",
67
+ "Central Aymara": "ayr_Latn",
68
+ "South Azerbaijani": "azb_Arab",
69
+ "North Azerbaijani": "azj_Latn",
70
+ "Bashkir": "bak_Cyrl",
71
+ "Bambara": "bam_Latn",
72
+ "Balinese": "ban_Latn",
73
+ "Belarusian": "bel_Cyrl",
74
+ "Bemba": "bem_Latn",
75
+ "Bengali": "ben_Beng",
76
+ "Bhojpuri": "bho_Deva",
77
+ "Banjar (Arabic script)": "bjn_Arab",
78
+ "Banjar (Latin script)": "bjn_Latn",
79
+ "Standard Tibetan": "bod_Tibt",
80
+ "Bosnian": "bos_Latn",
81
+ "Buginese": "bug_Latn",
82
+ "Bulgarian": "bul_Cyrl",
83
+ "Catalan": "cat_Latn",
84
+ "Cebuano": "ceb_Latn",
85
+ "Czech": "ces_Latn",
86
+ "Chokwe": "cjk_Latn",
87
+ "Central Kurdish": "ckb_Arab",
88
+ "Crimean Tatar": "crh_Latn",
89
+ "Welsh": "cym_Latn",
90
+ "Danish": "dan_Latn",
91
+ "German": "deu_Latn",
92
+ "Southwestern Dinka": "dik_Latn",
93
+ "Dyula": "dyu_Latn",
94
+ "Dzongkha": "dzo_Tibt",
95
+ "Greek": "ell_Grek",
96
+ "English": "eng_Latn",
97
+ "Esperanto": "epo_Latn",
98
+ "Estonian": "est_Latn",
99
+ "Basque": "eus_Latn",
100
+ "Ewe": "ewe_Latn",
101
+ "Faroese": "fao_Latn",
102
+ "Fijian": "fij_Latn",
103
+ "Finnish": "fin_Latn",
104
+ "Fon": "fon_Latn",
105
+ "French": "fra_Latn",
106
+ "Friulian": "fur_Latn",
107
+ "Nigerian Fulfulde": "fuv_Latn",
108
+ "Scottish Gaelic": "gla_Latn",
109
+ "Irish": "gle_Latn",
110
+ "Galician": "glg_Latn",
111
+ "Guarani": "grn_Latn",
112
+ "Gujarati": "guj_Gujr",
113
+ "Haitian Creole": "hat_Latn",
114
+ "Hausa": "hau_Latn",
115
+ "Hebrew": "heb_Hebr",
116
+ "Hindi": "hin_Deva",
117
+ "Chhattisgarhi": "hne_Deva",
118
+ "Croatian": "hrv_Latn",
119
+ "Hungarian": "hun_Latn",
120
+ "Armenian": "hye_Armn",
121
+ "Igbo": "ibo_Latn",
122
+ "Ilocano": "ilo_Latn",
123
+ "Indonesian": "ind_Latn",
124
+ "Icelandic": "isl_Latn",
125
+ "Italian": "ita_Latn",
126
+ "Javanese": "jav_Latn",
127
+ "Japanese": "jpn_Jpan",
128
+ "Kabyle": "kab_Latn",
129
+ "Jingpho": "kac_Latn",
130
+ "Kamba": "kam_Latn",
131
+ "Kannada": "kan_Knda",
132
+ "Kashmiri (Arabic script)": "kas_Arab",
133
+ "Kashmiri (Devanagari script)": "kas_Deva",
134
+ "Georgian": "kat_Geor",
135
+ "Central Kanuri (Arabic script)": "knc_Arab",
136
+ "Central Kanuri (Latin script)": "knc_Latn",
137
+ "Kazakh": "kaz_Cyrl",
138
+ "Kabiyè": "kbp_Latn",
139
+ "Kabuverdianu": "kea_Latn",
140
+ "Khmer": "khm_Khmr",
141
+ "Kikuyu": "kik_Latn",
142
+ "Kinyarwanda": "kin_Latn",
143
+ "Kyrgyz": "kir_Cyrl",
144
+ "Kimbundu": "kmb_Latn",
145
+ "Northern Kurdish": "kmr_Latn",
146
+ "Kikongo": "kon_Latn",
147
+ "Korean": "kor_Hang",
148
+ "Lao": "lao_Laoo",
149
+ "Ligurian": "lij_Latn",
150
+ "Limburgish": "lim_Latn",
151
+ "Lingala": "lin_Latn",
152
+ "Lithuanian": "lit_Latn",
153
+ "Lombard": "lmo_Latn",
154
+ "Latgalian": "ltg_Latn",
155
+ "Luxembourgish": "ltz_Latn",
156
+ "Luba-Kasai": "lua_Latn",
157
+ "Ganda": "lug_Latn",
158
+ "Luo": "luo_Latn",
159
+ "Mizo": "lus_Latn",
160
+ "Standard Latvian": "lvs_Latn",
161
+ "Magahi": "mag_Deva",
162
+ "Maithili": "mai_Deva",
163
+ "Malayalam": "mal_Mlym",
164
+ "Marathi": "mar_Deva",
165
+ "Minangkabau (Arabic script)": "min_Arab",
166
+ "Minangkabau (Latin script)": "min_Latn",
167
+ "Macedonian": "mkd_Cyrl",
168
+ "Plateau Malagasy": "plt_Latn",
169
+ "Maltese": "mlt_Latn",
170
+ "Meitei (Bengali script)": "mni_Beng",
171
+ "Halh Mongolian": "khk_Cyrl",
172
+ "Mossi": "mos_Latn",
173
+ "Maori": "mri_Latn",
174
+ "Burmese": "mya_Mymr",
175
+ "Dutch": "nld_Latn",
176
+ "Norwegian Nynorsk": "nno_Latn",
177
+ "Norwegian Bokmål": "nob_Latn",
178
+ "Nepali": "npi_Deva",
179
+ "Northern Sotho": "nso_Latn",
180
+ "Nuer": "nus_Latn",
181
+ "Nyanja": "nya_Latn",
182
+ "Occitan": "oci_Latn",
183
+ "West Central Oromo": "gaz_Latn",
184
+ "Odia": "ory_Orya",
185
+ "Pangasinan": "pag_Latn",
186
+ "Eastern Panjabi": "pan_Guru",
187
+ "Papiamento": "pap_Latn",
188
+ "Western Persian": "pes_Arab",
189
+ "Polish": "pol_Latn",
190
+ "Portuguese": "por_Latn",
191
+ "Dari": "prs_Arab",
192
+ "Southern Pashto": "pbt_Arab",
193
+ "Ayacucho Quechua": "quy_Latn",
194
+ "Romanian": "ron_Latn",
195
+ "Rundi": "run_Latn",
196
+ "Russian": "rus_Cyrl",
197
+ "Sango": "sag_Latn",
198
+ "Sanskrit": "san_Deva",
199
+ "Santali": "sat_Olck",
200
+ "Sicilian": "scn_Latn",
201
+ "Shan": "shn_Mymr",
202
+ "Sinhala": "sin_Sinh",
203
+ "Slovak": "slk_Latn",
204
+ "Slovenian": "slv_Latn",
205
+ "Samoan": "smo_Latn",
206
+ "Shona": "sna_Latn",
207
+ "Sindhi": "snd_Arab",
208
+ "Somali": "som_Latn",
209
+ "Southern Sotho": "sot_Latn",
210
+ "Spanish": "spa_Latn",
211
+ "Tosk Albanian": "als_Latn",
212
+ "Sardinian": "srd_Latn",
213
+ "Serbian": "srp_Cyrl",
214
+ "Swati": "ssw_Latn",
215
+ "Sundanese": "sun_Latn",
216
+ "Swedish": "swe_Latn",
217
+ "Swahili": "swh_Latn",
218
+ "Silesian": "szl_Latn",
219
+ "Tamil": "tam_Taml",
220
+ "Tatar": "tat_Cyrl",
221
+ "Telugu": "tel_Telu",
222
+ "Tajik": "tgk_Cyrl",
223
+ "Tagalog": "tgl_Latn",
224
+ "Thai": "tha_Thai",
225
+ "Tigrinya": "tir_Ethi",
226
+ "Tamasheq (Latin script)": "taq_Latn",
227
+ "Tamasheq (Tifinagh script)": "taq_Tfng",
228
+ "Tok Pisin": "tpi_Latn",
229
+ "Tswana": "tsn_Latn",
230
+ "Tsonga": "tso_Latn",
231
+ "Turkmen": "tuk_Latn",
232
+ "Tumbuka": "tum_Latn",
233
+ "Turkish": "tur_Latn",
234
+ "Twi": "twi_Latn",
235
+ "Central Atlas Tamazight": "tzm_Tfng",
236
+ "Uyghur": "uig_Arab",
237
+ "Ukrainian": "ukr_Cyrl",
238
+ "Umbundu": "umb_Latn",
239
+ "Urdu": "urd_Arab",
240
+ "Northern Uzbek": "uzn_Latn",
241
+ "Venetian": "vec_Latn",
242
+ "Vietnamese": "vie_Latn",
243
+ "Waray": "war_Latn",
244
+ "Wolof": "wol_Latn",
245
+ "Xhosa": "xho_Latn",
246
+ "Eastern Yiddish": "ydd_Hebr",
247
+ "Yoruba": "yor_Latn",
248
+ "Yue Chinese": "yue_Hant",
249
+ "Chinese (Simplified)": "zho_Hans",
250
+ "Chinese (Traditional)": "zho_Hant",
251
+ "Standard Malay": "zsm_Latn",
252
+ "Zulu": "zul_Latn",
253
+ }
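NLLBInference.update_model() above builds a standard transformers translation pipeline keyed by the FLORES-200 codes in NLLB_AVAILABLE_LANGS; the direct equivalent looks roughly like this (model and language choices are illustrative):

# Rough direct equivalent of what NLLBInference.update_model() sets up.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "facebook/nllb-200-distilled-600M"        # smallest of the three models listed above
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
translator = pipeline("translation", model=model, tokenizer=tokenizer,
                      src_lang="eng_Latn", tgt_lang="kor_Hang")   # codes from NLLB_AVAILABLE_LANGS
print(translator("Hello, world.")[0]["translation_text"])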
modules/subtitle_manager.py ADDED
@@ -0,0 +1,135 @@
1
+ import re
2
+
3
+
4
+ def timeformat_srt(time):
5
+ hours = time // 3600
6
+ minutes = (time - hours * 3600) // 60
7
+ seconds = time - hours * 3600 - minutes * 60
8
+ milliseconds = (time - int(time)) * 1000
9
+ return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
10
+
11
+
12
+ def timeformat_vtt(time):
13
+ hours = time // 3600
14
+ minutes = (time - hours * 3600) // 60
15
+ seconds = time - hours * 3600 - minutes * 60
16
+ milliseconds = (time - int(time)) * 1000
17
+ return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
18
+
19
+
20
+ def write_file(subtitle, output_file):
21
+ with open(output_file, 'w', encoding='utf-8') as f:
22
+ f.write(subtitle)
23
+
24
+
25
+ def get_srt(segments):
26
+ output = ""
27
+ for i, segment in enumerate(segments):
28
+ output += f"{i + 1}\n"
29
+ output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
30
+ if segment['text'].startswith(' '):
31
+ segment['text'] = segment['text'][1:]
32
+ output += f"{segment['text']}\n\n"
33
+ return output
34
+
35
+
36
+ def get_vtt(segments):
37
+ output = "WebVTT\n\n"
38
+ for i, segment in enumerate(segments):
39
+ output += f"{i + 1}\n"
40
+ output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
41
+ if segment['text'].startswith(' '):
42
+ segment['text'] = segment['text'][1:]
43
+ output += f"{segment['text']}\n\n"
44
+ return output
45
+
46
+
47
+ def get_txt(segments):
48
+ output = ""
49
+ for i, segment in enumerate(segments):
50
+ if segment['text'].startswith(' '):
51
+ segment['text'] = segment['text'][1:]
52
+ output += f"{segment['text']}\n"
53
+ return output
54
+
55
+
56
+ def parse_srt(file_path):
57
+ """Reads SRT file and returns as dict"""
58
+ with open(file_path, 'r', encoding='utf-8') as file:
59
+ srt_data = file.read()
60
+
61
+ data = []
62
+ blocks = srt_data.split('\n\n')
63
+
64
+ for block in blocks:
65
+ if block.strip() != '':
66
+ lines = block.strip().split('\n')
67
+ index = lines[0]
68
+ timestamp = lines[1]
69
+ sentence = ' '.join(lines[2:])
70
+
71
+ data.append({
72
+ "index": index,
73
+ "timestamp": timestamp,
74
+ "sentence": sentence
75
+ })
76
+ return data
77
+
78
+
79
+ def parse_vtt(file_path):
80
+ """Reads WebVTT file and returns as dict"""
81
+ with open(file_path, 'r', encoding='utf-8') as file:
82
+ webvtt_data = file.read()
83
+
84
+ data = []
85
+ blocks = webvtt_data.split('\n\n')
86
+
87
+ for block in blocks:
88
+ if block.strip() != '' and not block.strip().startswith("WebVTT"):
89
+ lines = block.strip().split('\n')
90
+ index = lines[0]
91
+ timestamp = lines[1]
92
+ sentence = ' '.join(lines[2:])
93
+
94
+ data.append({
95
+ "index": index,
96
+ "timestamp": timestamp,
97
+ "sentence": sentence
98
+ })
99
+
100
+ return data
101
+
102
+
103
+ def get_serialized_srt(dicts):
104
+ output = ""
105
+ for dic in dicts:
106
+ output += f'{dic["index"]}\n'
107
+ output += f'{dic["timestamp"]}\n'
108
+ output += f'{dic["sentence"]}\n\n'
109
+ return output
110
+
111
+
112
+ def get_serialized_vtt(dicts):
113
+ output = "WebVTT\n\n"
114
+ for dic in dicts:
115
+ output += f'{dic["index"]}\n'
116
+ output += f'{dic["timestamp"]}\n'
117
+ output += f'{dic["sentence"]}\n\n'
118
+ return output
119
+
120
+
121
+ def safe_filename(name):
122
+ from app import _args
123
+ INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
124
+ safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
125
+ if not _args.colab:
126
+ return safe_name
127
+ # Truncate the filename if it exceeds the max_length (20)
128
+ if len(safe_name) > 20:
129
+ file_extension = safe_name.split('.')[-1]
130
+ if len(file_extension) + 1 < 20:
131
+ truncated_name = safe_name[:20 - len(file_extension) - 1]
132
+ safe_name = truncated_name + '.' + file_extension
133
+ else:
134
+ safe_name = safe_name[:20]
135
+ return safe_name
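The parse_* and get_serialized_* helpers above are meant to round-trip: parse a subtitle file into dicts, rewrite each sentence, and serialize it back out. A small sketch with illustrative file names:

# Round-trip sketch for the helpers above; file names are illustrative.
from modules.subtitle_manager import parse_srt, get_serialized_srt, write_file

blocks = parse_srt(file_path="sample.srt")          # [{"index", "timestamp", "sentence"}, ...]
for block in blocks:
    block["sentence"] = block["sentence"].upper()   # any per-cue transformation goes here
write_file(get_serialized_srt(blocks), "sample_upper.srt")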
modules/translation_base.py ADDED
@@ -0,0 +1,148 @@
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+ from datetime import datetime
7
+
8
+ from modules.whisper_parameter import *
9
+ from modules.subtitle_manager import *
10
+
11
+
12
+ class TranslationBase(ABC):
13
+ def __init__(self,
14
+ model_dir: str):
15
+ super().__init__()
16
+ self.model = None
17
+ self.model_dir = model_dir
18
+ os.makedirs(self.model_dir, exist_ok=True)
19
+ self.current_model_size = None
20
+ self.device = self.get_device()
21
+
22
+ @abstractmethod
23
+ def translate(self,
24
+ text: str
25
+ ):
26
+ pass
27
+
28
+ @abstractmethod
29
+ def update_model(self,
30
+ model_size: str,
31
+ src_lang: str,
32
+ tgt_lang: str,
33
+ progress: gr.Progress
34
+ ):
35
+ pass
36
+
37
+ def translate_file(self,
38
+ fileobjs: list,
39
+ model_size: str,
40
+ src_lang: str,
41
+ tgt_lang: str,
42
+ add_timestamp: bool,
43
+ progress=gr.Progress()) -> list:
44
+ """
45
+ Translate subtitle file from source language to target language
46
+
47
+ Parameters
48
+ ----------
49
+ fileobjs: list
50
+ List of files to transcribe from gr.Files()
51
+ model_size: str
52
+ Translation model size from gr.Dropdown()
53
+ src_lang: str
54
+ Source language of the file to translate from gr.Dropdown()
55
+ tgt_lang: str
56
+ Target language of the file to translate from gr.Dropdown()
57
+ add_timestamp: bool
58
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
59
+ progress: gr.Progress
60
+ Indicator to show progress directly in gradio.
61
+ I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
62
+
63
+ Returns
64
+ ----------
65
+ A List of
66
+ String to return to gr.Textbox()
67
+ Files to return to gr.Files()
68
+ """
69
+ try:
70
+ self.update_model(model_size=model_size,
71
+ src_lang=src_lang,
72
+ tgt_lang=tgt_lang,
73
+ progress=progress)
74
+
75
+ files_info = {}
76
+ for fileobj in fileobjs:
77
+ file_path = fileobj.name
78
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
79
+ if file_ext == ".srt":
80
+ parsed_dicts = parse_srt(file_path=file_path)
81
+ total_progress = len(parsed_dicts)
82
+ for index, dic in enumerate(parsed_dicts):
83
+ progress(index / total_progress, desc="Translating..")
84
+ translated_text = self.translate(dic["sentence"])
85
+ dic["sentence"] = translated_text
86
+ subtitle = get_serialized_srt(parsed_dicts)
87
+
88
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
89
+ if add_timestamp:
90
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}.srt")
91
+ else:
92
+ output_path = os.path.join("outputs", "translations", f"{file_name}.srt")
93
+
94
+ elif file_ext == ".vtt":
95
+ parsed_dicts = parse_vtt(file_path=file_path)
96
+ total_progress = len(parsed_dicts)
97
+ for index, dic in enumerate(parsed_dicts):
98
+ progress(index / total_progress, desc="Translating..")
99
+ translated_text = self.translate(dic["sentence"])
100
+ dic["sentence"] = translated_text
101
+ subtitle = get_serialized_vtt(parsed_dicts)
102
+
103
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
104
+ if add_timestamp:
105
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}.vtt")
106
+ else:
107
+ output_path = os.path.join("outputs", "translations", f"{file_name}.vtt")
108
+
109
+ write_file(subtitle, output_path)
110
+ files_info[file_name] = subtitle
111
+
112
+ total_result = ''
113
+ for file_name, subtitle in files_info.items():
114
+ total_result += '------------------------------------\n'
115
+ total_result += f'{file_name}\n\n'
116
+ total_result += f'{subtitle}'
117
+
118
+ gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
119
+ return [gr_str, output_path]
120
+ except Exception as e:
121
+ print(f"Error: {str(e)}")
122
+ finally:
123
+ self.release_cuda_memory()
124
+ self.remove_input_files([fileobj.name for fileobj in fileobjs])
125
+
126
+ @staticmethod
127
+ def get_device():
128
+ if torch.cuda.is_available():
129
+ return "cuda"
130
+ elif torch.backends.mps.is_available():
131
+ return "mps"
132
+ else:
133
+ return "cpu"
134
+
135
+ @staticmethod
136
+ def release_cuda_memory():
137
+ if torch.cuda.is_available():
138
+ torch.cuda.empty_cache()
139
+ torch.cuda.reset_max_memory_allocated()
140
+
141
+ @staticmethod
142
+ def remove_input_files(file_paths: List[str]):
143
+ if not file_paths:
144
+ return
145
+
146
+ for file_path in file_paths:
147
+ if file_path and os.path.exists(file_path):
148
+ os.remove(file_path)
modules/whisper_Inference.py ADDED
@@ -0,0 +1,97 @@
1
+ import whisper
2
+ import gradio as gr
3
+ import time
4
+ import os
5
+ from typing import BinaryIO, Union, Tuple, List
6
+ import numpy as np
7
+ import torch
8
+
9
+ from modules.whisper_base import WhisperBase
10
+ from modules.whisper_parameter import *
11
+
12
+
13
+ class WhisperInference(WhisperBase):
14
+ def __init__(self):
15
+ super().__init__(
16
+ model_dir=os.path.join("models", "Whisper")
17
+ )
18
+
19
+ def transcribe(self,
20
+ audio: Union[str, np.ndarray, torch.Tensor],
21
+ progress: gr.Progress,
22
+ *whisper_params,
23
+ ) -> Tuple[List[dict], float]:
24
+ """
25
+ transcribe method for whisper.
26
+
27
+ Parameters
28
+ ----------
29
+ audio: Union[str, np.ndarray, torch.Tensor]
30
+ Audio path or file binary or Audio numpy array
31
+ progress: gr.Progress
32
+ Indicator to show progress directly in gradio.
33
+ *whisper_params: tuple
34
+ Gradio components related to Whisper. see whisper_data_class.py for details.
35
+
36
+ Returns
37
+ ----------
38
+ segments_result: List[dict]
39
+ list of dicts that includes start, end timestamps and transcribed text
40
+ elapsed_time: float
41
+ elapsed time for transcription
42
+ """
43
+ start_time = time.time()
44
+ params = WhisperValues(*whisper_params)
45
+
46
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
47
+ self.update_model(params.model_size, params.compute_type, progress)
48
+
49
+ if params.lang == "Automatic Detection":
50
+ params.lang = None
51
+
52
+ def progress_callback(progress_value):
53
+ progress(progress_value, desc="Transcribing..")
54
+
55
+ segments_result = self.model.transcribe(audio=audio,
56
+ language=params.lang,
57
+ verbose=False,
58
+ beam_size=params.beam_size,
59
+ logprob_threshold=params.log_prob_threshold,
60
+ no_speech_threshold=params.no_speech_threshold,
61
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
62
+ fp16=True if params.compute_type == "float16" else False,
63
+ best_of=params.best_of,
64
+ patience=params.patience,
65
+ temperature=params.temperature,
66
+ compression_ratio_threshold=params.compression_ratio_threshold,
67
+ progress_callback=progress_callback,)["segments"]
68
+ elapsed_time = time.time() - start_time
69
+
70
+ return segments_result, elapsed_time
71
+
72
+ def update_model(self,
73
+ model_size: str,
74
+ compute_type: str,
75
+ progress: gr.Progress,
76
+ ):
77
+ """
78
+ Update current model setting
79
+
80
+ Parameters
81
+ ----------
82
+ model_size: str
83
+ Size of whisper model
84
+ compute_type: str
85
+ Compute type for transcription.
86
+ see more info : https://opennmt.net/CTranslate2/quantization.html
87
+ progress: gr.Progress
88
+ Indicator to show progress directly in gradio.
89
+ """
90
+ progress(0, desc="Initializing Model..")
91
+ self.current_compute_type = compute_type
92
+ self.current_model_size = model_size
93
+ self.model = whisper.load_model(
94
+ name=model_size,
95
+ device=self.device,
96
+ download_root=self.model_dir
97
+ )
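WhisperInference delegates to openai-whisper's load_model()/transcribe(); note that the progress_callback argument used above comes from the author's forked whisper build linked in the docstrings. A minimal sketch with the vanilla package (model size and audio path are illustrative):

# Minimal sketch using the vanilla openai-whisper API (no progress_callback).
import whisper

model = whisper.load_model("base", download_root="models/Whisper")   # illustrative size and path
result = model.transcribe("sample.wav", fp16=False, temperature=0)
for segment in result["segments"]:
    print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text']}")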
modules/whisper_base.py ADDED
@@ -0,0 +1,333 @@
1
+ import os
2
+ import torch
3
+ from typing import List
4
+ import whisper
5
+ import gradio as gr
6
+ from abc import ABC, abstractmethod
7
+ from typing import BinaryIO, Union, Tuple, List
8
+ import numpy as np
9
+ from datetime import datetime
10
+
11
+ from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
+ from modules.youtube_manager import get_ytdata, get_ytaudio
13
+ from modules.whisper_parameter import *
14
+
15
+
16
+ class WhisperBase(ABC):
17
+ def __init__(self,
18
+ model_dir: str):
19
+ self.model = None
20
+ self.current_model_size = None
21
+ self.model_dir = model_dir
22
+ os.makedirs(self.model_dir, exist_ok=True)
23
+ self.available_models = whisper.available_models()
24
+ self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
25
+ self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
26
+ self.device = self.get_device()
27
+ self.available_compute_types = ["float16", "float32"]
28
+ self.current_compute_type = "float16" if self.device == "cuda" else "float32"
29
+
30
+ @abstractmethod
31
+ def transcribe(self,
32
+ audio: Union[str, BinaryIO, np.ndarray],
33
+ progress: gr.Progress,
34
+ *whisper_params,
35
+ ):
36
+ pass
37
+
38
+ @abstractmethod
39
+ def update_model(self,
40
+ model_size: str,
41
+ compute_type: str,
42
+ progress: gr.Progress
43
+ ):
44
+ pass
45
+
46
+ def transcribe_file(self,
47
+ files: list,
48
+ file_format: str,
49
+ add_timestamp: bool,
50
+ progress=gr.Progress(),
51
+ *whisper_params,
52
+ ) -> list:
53
+ """
54
+ Write subtitle file from Files
55
+
56
+ Parameters
57
+ ----------
58
+ files: list
59
+ List of files to transcribe from gr.Files()
60
+ file_format: str
61
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
62
+ add_timestamp: bool
63
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
64
+ progress: gr.Progress
65
+ Indicator to show progress directly in gradio.
66
+ *whisper_params: tuple
67
+ Gradio components related to Whisper. See modules/whisper_parameter.py for details.
68
+
69
+ Returns
70
+ ----------
71
+ result_str:
72
+ Result of transcription to return to gr.Textbox()
73
+ result_file_path:
74
+ Output file path to return to gr.Files()
75
+ """
76
+ try:
77
+ files_info = {}
78
+ for file in files:
79
+ transcribed_segments, time_for_task = self.transcribe(
80
+ file.name,
81
+ progress,
82
+ *whisper_params,
83
+ )
84
+
85
+ file_name, file_ext = os.path.splitext(os.path.basename(file.name))
86
+ file_name = safe_filename(file_name)
87
+ subtitle, file_path = self.generate_and_write_file(
88
+ file_name=file_name,
89
+ transcribed_segments=transcribed_segments,
90
+ add_timestamp=add_timestamp,
91
+ file_format=file_format
92
+ )
93
+ files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
94
+
95
+ total_result = ''
96
+ total_time = 0
97
+ for file_name, info in files_info.items():
98
+ total_result += '------------------------------------\n'
99
+ total_result += f'{file_name}\n\n'
100
+ total_result += f'{info["subtitle"]}'
101
+ total_time += info["time_for_task"]
102
+
103
+ result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
104
+ result_file_path = [info['path'] for info in files_info.values()]
105
+
106
+ return [result_str, result_file_path]
107
+
108
+ except Exception as e:
109
+ print(f"Error transcribing file: {e}")
110
+ finally:
111
+ self.release_cuda_memory()
112
+ if files:
113
+ self.remove_input_files([file.name for file in files])
114
+
115
+ def transcribe_mic(self,
116
+ mic_audio: str,
117
+ file_format: str,
118
+ progress=gr.Progress(),
119
+ *whisper_params,
120
+ ) -> list:
121
+ """
122
+ Write subtitle file from microphone
123
+
124
+ Parameters
125
+ ----------
126
+ mic_audio: str
127
+ Audio file path from gr.Microphone()
128
+ file_format: str
129
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
130
+ progress: gr.Progress
131
+ Indicator to show progress directly in gradio.
132
+ *whisper_params: tuple
133
+ Gradio components related to Whisper. See modules/whisper_parameter.py for details.
134
+
135
+ Returns
136
+ ----------
137
+ result_str:
138
+ Result of transcription to return to gr.Textbox()
139
+ result_file_path:
140
+ Output file path to return to gr.Files()
141
+ """
142
+ try:
143
+ progress(0, desc="Loading Audio..")
144
+ transcribed_segments, time_for_task = self.transcribe(
145
+ mic_audio,
146
+ progress,
147
+ *whisper_params,
148
+ )
149
+ progress(1, desc="Completed!")
150
+
151
+ subtitle, result_file_path = self.generate_and_write_file(
152
+ file_name="Mic",
153
+ transcribed_segments=transcribed_segments,
154
+ add_timestamp=True,
155
+ file_format=file_format
156
+ )
157
+
158
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
159
+ return [result_str, result_file_path]
160
+ except Exception as e:
161
+ print(f"Error transcribing file: {e}")
162
+ finally:
163
+ self.release_cuda_memory()
164
+ self.remove_input_files([mic_audio])
165
+
166
+ def transcribe_youtube(self,
167
+ youtube_link: str,
168
+ file_format: str,
169
+ add_timestamp: bool,
170
+ progress=gr.Progress(),
171
+ *whisper_params,
172
+ ) -> list:
173
+ """
174
+ Write subtitle file from Youtube
175
+
176
+ Parameters
177
+ ----------
178
+ youtube_link: str
179
+ URL of the Youtube video to transcribe from gr.Textbox()
180
+ file_format: str
181
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
182
+ add_timestamp: bool
183
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
184
+ progress: gr.Progress
185
+ Indicator to show progress directly in gradio.
186
+ *whisper_params: tuple
187
+ Gradio components related to Whisper. See modules/whisper_parameter.py for details.
188
+
189
+ Returns
190
+ ----------
191
+ result_str:
192
+ Result of transcription to return to gr.Textbox()
193
+ result_file_path:
194
+ Output file path to return to gr.Files()
195
+ """
196
+ try:
197
+ progress(0, desc="Loading Audio from Youtube..")
198
+ yt = get_ytdata(youtube_link)
199
+ audio = get_ytaudio(yt)
200
+
201
+ transcribed_segments, time_for_task = self.transcribe(
202
+ audio,
203
+ progress,
204
+ *whisper_params,
205
+ )
206
+
207
+ progress(1, desc="Completed!")
208
+
209
+ file_name = safe_filename(yt.title)
210
+ subtitle, result_file_path = self.generate_and_write_file(
211
+ file_name=file_name,
212
+ transcribed_segments=transcribed_segments,
213
+ add_timestamp=add_timestamp,
214
+ file_format=file_format
215
+ )
216
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
217
+
218
+ return [result_str, result_file_path]
219
+
220
+ except Exception as e:
221
+ print(f"Error transcribing file: {e}")
222
+ finally:
223
+ try:
224
+ if 'yt' not in locals():
225
+ yt = get_ytdata(youtube_link)
226
+ file_path = get_ytaudio(yt)
227
+ else:
228
+ file_path = get_ytaudio(yt)
229
+
230
+ self.release_cuda_memory()
231
+ self.remove_input_files([file_path])
232
+ except Exception as cleanup_error:
233
+ pass
234
+
235
+ @staticmethod
236
+ def generate_and_write_file(file_name: str,
237
+ transcribed_segments: list,
238
+ add_timestamp: bool,
239
+ file_format: str,
240
+ ) -> Tuple[str, str]:
241
+ """
242
+ Writes subtitle file
243
+
244
+ Parameters
245
+ ----------
246
+ file_name: str
247
+ Output file name
248
+ transcribed_segments: list
249
+ Text segments transcribed from audio
250
+ add_timestamp: bool
251
+ Determines whether to add a timestamp to the end of the filename.
252
+ file_format: str
253
+ File format to write. Supported formats: [SRT, WebVTT, txt]
254
+
255
+ Returns
256
+ ----------
257
+ content: str
258
+ Result of the transcription
259
+ output_path: str
260
+ output file path
261
+ """
262
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
263
+ if add_timestamp:
264
+ output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
265
+ else:
266
+ output_path = os.path.join("outputs", f"{file_name}")
267
+
268
+ if file_format == "SRT":
269
+ content = get_srt(transcribed_segments)
270
+ output_path += '.srt'
271
+ write_file(content, output_path)
272
+
273
+ elif file_format == "WebVTT":
274
+ content = get_vtt(transcribed_segments)
275
+ output_path += '.vtt'
276
+ write_file(content, output_path)
277
+
278
+ elif file_format == "txt":
279
+ content = get_txt(transcribed_segments)
280
+ output_path += '.txt'
281
+ write_file(content, output_path)
282
+ return content, output_path
283
+
284
+ @staticmethod
285
+ def format_time(elapsed_time: float) -> str:
286
+ """
287
+ Get {hours} {minutes} {seconds} time format string
288
+
289
+ Parameters
290
+ ----------
291
+ elapsed_time: float
292
+ Elapsed time for transcription
293
+
294
+ Returns
295
+ ----------
296
+ Time format string
297
+ """
298
+ hours, rem = divmod(elapsed_time, 3600)
299
+ minutes, seconds = divmod(rem, 60)
300
+
301
+ time_str = ""
302
+ if hours:
303
+ time_str += f"{hours} hours "
304
+ if minutes:
305
+ time_str += f"{minutes} minutes "
306
+ seconds = round(seconds)
307
+ time_str += f"{seconds} seconds"
308
+
309
+ return time_str.strip()
310
+
311
+ @staticmethod
312
+ def get_device():
313
+ if torch.cuda.is_available():
314
+ return "cuda"
315
+ elif torch.backends.mps.is_available():
316
+ return "mps"
317
+ else:
318
+ return "cpu"
319
+
320
+ @staticmethod
321
+ def release_cuda_memory():
322
+ if torch.cuda.is_available():
323
+ torch.cuda.empty_cache()
324
+ torch.cuda.reset_max_memory_allocated()
325
+
326
+ @staticmethod
327
+ def remove_input_files(file_paths: List[str]):
328
+ if not file_paths:
329
+ return
330
+
331
+ for file_path in file_paths:
332
+ if file_path and os.path.exists(file_path):
333
+ os.remove(file_path)
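The static helpers at the bottom of `WhisperBase` can be exercised without loading any model. A minimal sketch, assuming `modules/subtitle_manager.py` consumes segments shaped like Whisper's output (the segments and file name below are fabricated for illustration):

```python
import os
from modules.whisper_base import WhisperBase

os.makedirs("outputs", exist_ok=True)  # generate_and_write_file writes here

segments = [
    {"start": 0.0, "end": 2.5, "text": "Hello there."},
    {"start": 2.5, "end": 5.0, "text": "General Kenobi."},
]
content, path = WhisperBase.generate_and_write_file(
    file_name="demo",
    transcribed_segments=segments,
    add_timestamp=False,
    file_format="SRT",
)
print(path)                           # outputs/demo.srt
print(WhisperBase.format_time(3725))  # "1 hours 2 minutes 5 seconds"
```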
modules/whisper_parameter.py ADDED
@@ -0,0 +1,153 @@
1
+ from dataclasses import dataclass, fields
2
+ import gradio as gr
3
+ from typing import Optional
4
+
5
+
6
+ @dataclass
7
+ class WhisperGradioComponents:
8
+ model_size: gr.Dropdown
9
+ lang: gr.Dropdown
10
+ is_translate: gr.Checkbox
11
+ beam_size: gr.Number
12
+ log_prob_threshold: gr.Number
13
+ no_speech_threshold: gr.Number
14
+ compute_type: gr.Dropdown
15
+ best_of: gr.Number
16
+ patience: gr.Number
17
+ condition_on_previous_text: gr.Checkbox
18
+ initial_prompt: gr.Textbox
19
+ temperature: gr.Slider
20
+ compression_ratio_threshold: gr.Number
21
+ vad_filter: gr.Checkbox
22
+ threshold: gr.Slider
23
+ min_speech_duration_ms: gr.Number
24
+ max_speech_duration_s: gr.Number
25
+ min_silence_duration_ms: gr.Number
26
+ window_size_samples: gr.Number
27
+ speech_pad_ms: gr.Number
28
+ """
29
+ A data class for the Gradio components of the Whisper parameters. Used "before" Gradio pre-processing.
30
+ See more about Gradio pre-processing: https://www.gradio.app/docs/components
31
+
32
+ Attributes
33
+ ----------
34
+ model_size: gr.Dropdown
35
+ Whisper model size.
36
+
37
+ lang: gr.Dropdown
38
+ Source language of the file to transcribe.
39
+
40
+ is_translate: gr.Checkbox
41
+ Boolean value that determines whether to translate to English.
42
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
43
+
44
+ beam_size: gr.Number
45
+ Int value that is used for decoding option.
46
+
47
+ log_prob_threshold: gr.Number
48
+ If the average log probability over sampled tokens is below this value, treat as failed.
49
+
50
+ no_speech_threshold: gr.Number
51
+ If the no_speech probability is higher than this value AND
52
+ the average log probability over sampled tokens is below `log_prob_threshold`,
53
+ consider the segment as silent.
54
+
55
+ compute_type: gr.Dropdown
56
+ compute type for transcription.
57
+ see more info : https://opennmt.net/CTranslate2/quantization.html
58
+
59
+ best_of: gr.Number
60
+ Number of candidates when sampling with non-zero temperature.
61
+
62
+ patience: gr.Number
63
+ Beam search patience factor.
64
+
65
+ condition_on_previous_text: gr.Checkbox
66
+ if True, the previous output of the model is provided as a prompt for the next window;
67
+ disabling may make the text inconsistent across windows, but the model becomes less prone to
68
+ getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
69
+
70
+ initial_prompt: gr.Textbox
71
+ Optional text to provide as a prompt for the first window. This can be used to provide, or
72
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
73
+ to make it more likely to predict those words correctly.
74
+
75
+ temperature: gr.Slider
76
+ Temperature for sampling. It can be a tuple of temperatures,
77
+ which will be successively used upon failures according to either
78
+ `compression_ratio_threshold` or `log_prob_threshold`.
79
+
80
+ compression_ratio_threshold: gr.Number
81
+ If the gzip compression ratio is above this value, treat as failed
82
+
83
+ vad_filter: gr.Checkbox
84
+ Enable the voice activity detection (VAD) to filter out parts of the audio
85
+ without speech. This step uses the Silero VAD model
86
+ https://github.com/snakers4/silero-vad.
87
+
88
+ threshold: gr.Slider
89
+ This parameter is related to Silero VAD. Speech threshold.
90
+ Silero VAD outputs speech probabilities for each audio chunk,
91
+ probabilities ABOVE this value are considered as SPEECH. It is better to tune this
92
+ parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
93
+
94
+ min_speech_duration_ms: gr.Number
95
+ This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
96
+
97
+ max_speech_duration_s: gr.Number
98
+ This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
99
+ than max_speech_duration_s will be split at the timestamp of the last silence that
100
+ lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
101
+ split aggressively just before max_speech_duration_s.
102
+
103
+ min_silence_duration_ms: gr.Number
104
+ This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
105
+ before separating it
106
+
107
+ window_size_samples: gr.Number
108
+ This parameter is related with Silero VAD. Audio chunks of window_size_samples size are fed to the silero VAD model.
109
+ WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
110
+ Values other than these may affect model performance!!
111
+
112
+ speech_pad_ms: gr.Number
113
+ This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
114
+ """
115
+
116
+ def to_list(self) -> list:
117
+ """
118
+ Converts the data class attributes into a list. Used "before" Gradio pre-processing.
119
+ See more about Gradio pre-processing: https://www.gradio.app/docs/components
120
+
121
+ Returns
122
+ ----------
123
+ A list of Gradio components
124
+ """
125
+ return [getattr(self, f.name) for f in fields(self)]
126
+
127
+
128
+ @dataclass
129
+ class WhisperValues:
130
+ model_size: str
131
+ lang: str
132
+ is_translate: bool
133
+ beam_size: int
134
+ log_prob_threshold: float
135
+ no_speech_threshold: float
136
+ compute_type: str
137
+ best_of: int
138
+ patience: float
139
+ condition_on_previous_text: bool
140
+ initial_prompt: Optional[str]
141
+ temperature: float
142
+ compression_ratio_threshold: float
143
+ vad_filter: bool
144
+ threshold: float
145
+ min_speech_duration_ms: int
146
+ max_speech_duration_s: float
147
+ min_silence_duration_ms: int
148
+ window_size_samples: int
149
+ speech_pad_ms: int
150
+ """
151
+ A data class holding Whisper parameter values. Used "after" Gradio pre-processing.
152
+ See more about Gradio pre-processing: https://www.gradio.app/docs/components
153
+ """
modules/youtube_manager.py ADDED
@@ -0,0 +1,15 @@
1
+ from pytube import YouTube
2
+ import os
3
+
4
+
5
+ def get_ytdata(link):
6
+ return YouTube(link)
7
+
8
+
9
+ def get_ytmetas(link):
10
+ yt = YouTube(link)
11
+ return yt.thumbnail_url, yt.title, yt.description
12
+
13
+
14
+ def get_ytaudio(ytdata: YouTube):
15
+ return ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
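These three helpers wrap pytube end to end: metadata for the UI preview, then an audio-only download that the transcription backends consume. A minimal usage sketch; the URL is a placeholder, not taken from this repository:

```python
from modules.youtube_manager import get_ytdata, get_ytmetas, get_ytaudio

link = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # placeholder URL

thumbnail_url, title, description = get_ytmetas(link)

yt = get_ytdata(link)
audio_path = get_ytaudio(yt)  # downloads the audio-only stream to modules/yt_tmp.wav
print(title, audio_path)
```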
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
+ torch
3
+ git+https://github.com/jhj0517/jhj0517-whisper.git
4
+ faster-whisper==1.0.2
5
+ transformers
6
+ gradio==4.29.0
7
+ pytube
ui/__init__.py ADDED
File without changes
ui/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (147 Bytes).
 
ui/__pycache__/htmls.cpython-312.pyc ADDED
Binary file (2.01 kB).
 
ui/htmls.py ADDED
@@ -0,0 +1,97 @@
1
+ CSS = """
2
+ .bmc-button {
3
+ padding: 2px 5px;
4
+ border-radius: 5px;
5
+ background-color: #FF813F;
6
+ color: white;
7
+ box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
8
+ text-decoration: none;
9
+ display: inline-block;
10
+ font-size: 20px;
11
+ margin: 2px;
12
+ cursor: pointer;
13
+ -webkit-transition: background-color 0.3s ease;
14
+ -ms-transition: background-color 0.3s ease;
15
+ transition: background-color 0.3s ease;
16
+ }
17
+ .bmc-button:hover,
18
+ .bmc-button:active,
19
+ .bmc-button:focus {
20
+ background-color: #FF5633;
21
+ }
22
+ .markdown {
23
+ margin-bottom: 0;
24
+ padding-bottom: 0;
25
+ }
26
+ .tabs {
27
+ margin-top: 0;
28
+ padding-top: 0;
29
+ }
30
+
31
+ #md_project a {
32
+ color: black;
33
+ text-decoration: none;
34
+ }
35
+ #md_project a:hover {
36
+ text-decoration: underline;
37
+ }
38
+ """
39
+
40
+ MARKDOWN = """
41
+ ### [Whisper Web-UI](https://github.com/jhj0517/Whsiper-WebUI)
42
+ """
43
+
44
+
45
+ NLLB_VRAM_TABLE = """
46
+ <!DOCTYPE html>
47
+ <html lang="en">
48
+ <head>
49
+ <meta charset="UTF-8">
50
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
51
+ <style>
52
+ table {
53
+ border-collapse: collapse;
54
+ width: 100%;
55
+ }
56
+ th, td {
57
+ border: 1px solid #dddddd;
58
+ text-align: left;
59
+ padding: 8px;
60
+ }
61
+ th {
62
+ background-color: #f2f2f2;
63
+ }
64
+ </style>
65
+ </head>
66
+ <body>
67
+
68
+ <details>
69
+ <summary>VRAM usage for each model</summary>
70
+ <table>
71
+ <thead>
72
+ <tr>
73
+ <th>Model name</th>
74
+ <th>Required VRAM</th>
75
+ </tr>
76
+ </thead>
77
+ <tbody>
78
+ <tr>
79
+ <td>nllb-200-3.3B</td>
80
+ <td>~16GB</td>
81
+ </tr>
82
+ <tr>
83
+ <td>nllb-200-1.3B</td>
84
+ <td>~8GB</td>
85
+ </tr>
86
+ <tr>
87
+ <td>nllb-200-distilled-600M</td>
88
+ <td>~4GB</td>
89
+ </tr>
90
+ </tbody>
91
+ </table>
92
+ <p><strong>Note:</strong> Be mindful of your VRAM! The table above provides an approximate VRAM usage for each model.</p>
93
+ </details>
94
+
95
+ </body>
96
+ </html>
97
+ """