gobeldan committed
Commit 1dbb0f9
Parent: be4277e

Upload 4 files

Files changed (4)
  1. app.py +136 -0
  2. languages.py +146 -0
  3. requirements.txt +2 -0
  4. subtitle_manager.py +57 -0
app.py ADDED
@@ -0,0 +1,136 @@
+import time
+import logging
+import os
+import gradio as gr
+from faster_whisper import WhisperModel
+from languages import get_language_names, get_language_from_name
+from subtitle_manager import Subtitle
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO)
+last_model = None
+model = None
+
+def transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
+                                     chunk_length, compute_type, beam_size, vad_filter, min_silence_duration_ms,
+                                     progress=gr.Progress()):
+    global last_model
+    global model
+
+    progress(0, desc="Loading Audio..")
+    logging.info(f"languageName: {languageName}")
+    logging.info(f"urlData: {urlData}")
+    logging.info(f"multipleFiles: {multipleFiles}")
+    logging.info(f"microphoneData: {microphoneData}")
+    logging.info(f"task: {task}")
+    logging.info(f"chunk_length: {chunk_length}")
+
+    # Only reload the model when the selected model name has changed.
+    if last_model is None or modelName != last_model:
+        logging.info("first or new model")
+        progress(0.1, desc="Loading Model..")
+        model = None
+        model = WhisperModel(modelName, device="cuda", compute_type=compute_type, cpu_threads=os.cpu_count())  # device="auto", compute_type="float16"
+        print('loaded')
+    else:
+        logging.info("Model not changed")
+    last_model = modelName
+
+    srt_sub = Subtitle("srt")
+    # vtt_sub = Subtitle("vtt")
+    # txt_sub = Subtitle("txt")
+
+    # Collect every audio source (uploads, URL, microphone) into one list.
+    files = []
+    if multipleFiles:
+        files += multipleFiles
+    if urlData:
+        files.append(urlData)
+    if microphoneData:
+        files.append(microphoneData)
+    logging.info(files)
+
+    languageName = None if languageName == "Automatic Detection" else get_language_from_name(languageName).code
+
+    files_out = []
+    vtt = ""
+    txt = ""
+    for file in progress.tqdm(files, desc="Working..."):
+        start_time = time.time()
+        segments, info = model.transcribe(
+            file,
+            beam_size=beam_size,
+            vad_filter=vad_filter,
+            language=languageName,
+            vad_parameters=dict(min_silence_duration_ms=min_silence_duration_ms),
+            # max_new_tokens=128,
+            condition_on_previous_text=False,
+            chunk_length=chunk_length,
+        )
+
+        file_name = Path(file).stem
+        files_out_srt = srt_sub.write_subtitle(segments, file_name, modelName, progress)
+        # txt = txt_sub.get_subtitle(segments, progress)
+        logging.info(f"transcribe: {time.time() - start_time} sec.")
+        files_out += [files_out_srt]
+
+    return files_out, vtt, txt
+
+
+with gr.Blocks(title="Fast Whisper WebUI") as demo:
+    description = "faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, a fast inference engine for Transformer models."
+    article = "Read the [documentation here](https://github.com/SYSTRAN/faster-whisper)."
+    whisper_models = [
+        "tiny", "tiny.en",
+        "base", "base.en",
+        "small", "small.en", "distil-small.en",
+        "medium", "medium.en", "distil-medium.en",
+        "large",
+        "large-v1",
+        "large-v2", "distil-large-v2",
+        "large-v3",
+    ]
+    compute_types = [
+        "auto", "default", "int8", "int8_float32",
+        "int8_float16", "int8_bfloat16", "int16",
+        "float16", "float32", "bfloat16"
+    ]
+
+    # Settings
+    # Can't pass a Dropdown created inside an Accordion to `inputs`, so the controls are defined inline below.
+    # with gr.Accordion("Settings", open=False):
+    #     task = gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
+    #     chunk_length = gr.Number(label='chunk_length', value=30, interactive=True),
+    #     compute_type = gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
+    #     beam_size = gr.Number(label='beam_size', value=5, interactive=True),
+    #     vad_filter = gr.Checkbox(label='vad_filter', info='Use vad_filter', value=True),
+    #     vad_min_silence_duration_ms = gr.Number(label='Vad min_silence_duration_ms', value=500, interactive=True),
+
+    gr.Interface(
+        fn=transcribe_webui_simple_progress,
+        description=description,
+        article=article,
+        inputs=[
+            gr.Dropdown(choices=whisper_models, value="distil-large-v2", label="Model", info="Select whisper model", interactive=True),
+            gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", info="Select audio voice language", interactive=True),
+            gr.Text(label="URL", info="(YouTube, etc.)", interactive=True),
+            gr.File(label="Upload Files", file_count="multiple"),
+            gr.Audio(sources=["microphone"], type="filepath", label="Microphone Input"),
+
+            gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
+            gr.Number(label='chunk_length', value=30, interactive=True),
+            gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
+            gr.Number(label='beam_size', value=5, interactive=True),
+            gr.Checkbox(label='vad_filter', info='Use vad_filter', value=True),
+            gr.Number(label='Vad min_silence_duration_ms', value=500, interactive=True),
+        ],
+        outputs=[
+            gr.File(label="Download"),
+            gr.Text(label="Transcription"),
+            gr.Text(label="Segments"),
+        ]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
+
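Worth noting when reading transcribe_webui_simple_progress: faster-whisper's transcribe() returns its segments lazily (a generator plus an info object), so the actual decoding happens while Subtitle.write_subtitle iterates the segments under the progress bar. A minimal sketch of that call outside Gradio; the "small" model size and the audio.mp3 path are placeholders, not values from this commit, and CUDA is assumed to be available:

from faster_whisper import WhisperModel

# Placeholder model size and file path, for illustration only.
model = WhisperModel("small", device="cuda", compute_type="auto")

# segments is a generator: decoding runs as it is consumed, not at this call.
segments, info = model.transcribe("audio.mp3", beam_size=5, vad_filter=True)
print(info.language, info.language_probability)

for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")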
languages.py ADDED
@@ -0,0 +1,146 @@
+class Language():
+    def __init__(self, code, name):
+        self.code = code
+        self.name = name
+
+    def __str__(self):
+        return "Language(code={}, name={})".format(self.code, self.name)
+
+LANGUAGES = [
+    Language('en', 'English'),
+    Language('zh', 'Chinese'),
+    Language('de', 'German'),
+    Language('es', 'Spanish'),
+    Language('ru', 'Russian'),
+    Language('ko', 'Korean'),
+    Language('fr', 'French'),
+    Language('ja', 'Japanese'),
+    Language('pt', 'Portuguese'),
+    Language('tr', 'Turkish'),
+    Language('pl', 'Polish'),
+    Language('ca', 'Catalan'),
+    Language('nl', 'Dutch'),
+    Language('ar', 'Arabic'),
+    Language('sv', 'Swedish'),
+    Language('it', 'Italian'),
+    Language('id', 'Indonesian'),
+    Language('hi', 'Hindi'),
+    Language('fi', 'Finnish'),
+    Language('vi', 'Vietnamese'),
+    Language('he', 'Hebrew'),
+    Language('uk', 'Ukrainian'),
+    Language('el', 'Greek'),
+    Language('ms', 'Malay'),
+    Language('cs', 'Czech'),
+    Language('ro', 'Romanian'),
+    Language('da', 'Danish'),
+    Language('hu', 'Hungarian'),
+    Language('ta', 'Tamil'),
+    Language('no', 'Norwegian'),
+    Language('th', 'Thai'),
+    Language('ur', 'Urdu'),
+    Language('hr', 'Croatian'),
+    Language('bg', 'Bulgarian'),
+    Language('lt', 'Lithuanian'),
+    Language('la', 'Latin'),
+    Language('mi', 'Maori'),
+    Language('ml', 'Malayalam'),
+    Language('cy', 'Welsh'),
+    Language('sk', 'Slovak'),
+    Language('te', 'Telugu'),
+    Language('fa', 'Persian'),
+    Language('lv', 'Latvian'),
+    Language('bn', 'Bengali'),
+    Language('sr', 'Serbian'),
+    Language('az', 'Azerbaijani'),
+    Language('sl', 'Slovenian'),
+    Language('kn', 'Kannada'),
+    Language('et', 'Estonian'),
+    Language('mk', 'Macedonian'),
+    Language('br', 'Breton'),
+    Language('eu', 'Basque'),
+    Language('is', 'Icelandic'),
+    Language('hy', 'Armenian'),
+    Language('ne', 'Nepali'),
+    Language('mn', 'Mongolian'),
+    Language('bs', 'Bosnian'),
+    Language('kk', 'Kazakh'),
+    Language('sq', 'Albanian'),
+    Language('sw', 'Swahili'),
+    Language('gl', 'Galician'),
+    Language('mr', 'Marathi'),
+    Language('pa', 'Punjabi'),
+    Language('si', 'Sinhala'),
+    Language('km', 'Khmer'),
+    Language('sn', 'Shona'),
+    Language('yo', 'Yoruba'),
+    Language('so', 'Somali'),
+    Language('af', 'Afrikaans'),
+    Language('oc', 'Occitan'),
+    Language('ka', 'Georgian'),
+    Language('be', 'Belarusian'),
+    Language('tg', 'Tajik'),
+    Language('sd', 'Sindhi'),
+    Language('gu', 'Gujarati'),
+    Language('am', 'Amharic'),
+    Language('yi', 'Yiddish'),
+    Language('lo', 'Lao'),
+    Language('uz', 'Uzbek'),
+    Language('fo', 'Faroese'),
+    Language('ht', 'Haitian creole'),
+    Language('ps', 'Pashto'),
+    Language('tk', 'Turkmen'),
+    Language('nn', 'Nynorsk'),
+    Language('mt', 'Maltese'),
+    Language('sa', 'Sanskrit'),
+    Language('lb', 'Luxembourgish'),
+    Language('my', 'Myanmar'),
+    Language('bo', 'Tibetan'),
+    Language('tl', 'Tagalog'),
+    Language('mg', 'Malagasy'),
+    Language('as', 'Assamese'),
+    Language('tt', 'Tatar'),
+    Language('haw', 'Hawaiian'),
+    Language('ln', 'Lingala'),
+    Language('ha', 'Hausa'),
+    Language('ba', 'Bashkir'),
+    Language('jw', 'Javanese'),
+    Language('su', 'Sundanese')
+]
+
+_TO_LANGUAGE_CODE = {
+    **{language.code: language for language in LANGUAGES},
+    # Alternate spellings (note: these aliases map to code strings, not Language objects).
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+}
+
+_FROM_LANGUAGE_NAME = {
+    **{language.name.lower(): language for language in LANGUAGES}
+}
+
+def get_language_from_code(language_code, default=None) -> Language:
+    """Return the Language for the given language code."""
+    return _TO_LANGUAGE_CODE.get(language_code, default)
+
+def get_language_from_name(language, default=None) -> Language:
+    """Return the Language for the given language name (case-insensitive)."""
+    return _FROM_LANGUAGE_NAME.get(language.lower() if language else None, default)
+
+def get_language_names():
+    """Return a list of language names."""
+    return [language.name for language in LANGUAGES]
+
+if __name__ == "__main__":
+    # Test lookup
+    print(get_language_from_code('en'))
+    print(get_language_from_name('English'))
+    print(get_language_names())
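A quick sketch of how app.py uses these helpers to turn the Language dropdown value into a faster-whisper language code; "German" here is just an example value:

from languages import get_language_from_name, get_language_from_code

choice = "German"  # value coming from the Gradio "Language" dropdown
language = None if choice == "Automatic Detection" else get_language_from_name(choice).code
print(language)                      # "de"; None tells faster-whisper to auto-detect
print(get_language_from_code("de"))  # Language(code=de, name=German)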
requirements.txt ADDED
@@ -0,0 +1,2 @@
+gradio==4.16.0
+faster-whisper @ git+https://github.com/guillaumekln/faster-whisper@master
subtitle_manager.py ADDED
@@ -0,0 +1,57 @@
+import logging
+
+class Subtitle():
+    def __init__(self, ext="srt"):
+        # Per-format settings: millisecond separator, file header, and a cue formatter.
+        sub_dict = {
+            "srt": {
+                "coma": ",",
+                "header": "",
+                "format": lambda i, start, end, text: f"{i + 1}\n{self.timeformat(start)} --> {self.timeformat(end if end is not None else start)}\n{text}\n\n",
+            },
+            "vtt": {
+                "coma": ".",
+                "header": "WEBVTT\n\n",
+                "format": lambda i, start, end, text: f"{self.timeformat(start)} --> {self.timeformat(end if end is not None else start)}\n{text}\n\n",
+            },
+            "txt": {
+                "coma": "",
+                "header": "",
+                "format": lambda i, start, end, text: f"{text}\n",
+            },
+        }
+
+        self.ext = ext
+        self.coma = sub_dict[ext]["coma"]
+        self.header = sub_dict[ext]["header"]
+        self.format = sub_dict[ext]["format"]
+
+    def timeformat(self, time):
+        hours = time // 3600
+        minutes = (time - hours * 3600) // 60
+        seconds = time - hours * 3600 - minutes * 60
+        milliseconds = (time - int(time)) * 1000
+        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}{self.coma}{int(milliseconds):03d}"
+
+    def get_subtitle(self, segments, progress):
+        output = self.header
+        # for i, segment in enumerate(segments):
+        for i, segment in enumerate(progress.tqdm(segments, desc="Whisper working...")):
+            text = segment.text
+            if text.startswith(' '):
+                text = text[1:]
+            try:
+                result = self.format(i, segment.start, segment.end, text)
+                output += result
+                # logging.info(result)
+            except Exception as e:
+                logging.error("%s %s", e, segment)
+        return output
+
+    def write_subtitle(self, segments, output_file, model, progress):
+        # output_file = output_file.split('.')[0]
+        output_file += f".({model})." + self.ext
+        subtitle = self.get_subtitle(segments, progress)
+
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(subtitle)
+        return output_file
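Subtitle can also be exercised on its own; a small sketch with stand-in segment objects and a dummy progress shim in place of faster-whisper output and gr.Progress() (all values below are illustrative):

from types import SimpleNamespace
from subtitle_manager import Subtitle

# Stand-in segments shaped like faster-whisper's (start, end, text).
segments = [
    SimpleNamespace(start=0.0, end=2.5, text=" Hello there."),
    SimpleNamespace(start=2.5, end=5.0, text=" This is a test."),
]

class NoProgress:
    # Minimal shim: get_subtitle only needs progress.tqdm(iterable, desc=...).
    def tqdm(self, iterable, desc=None):
        return iterable

srt_text = Subtitle("srt").get_subtitle(segments, NoProgress())
print(srt_text)
# 1
# 00:00:00,000 --> 00:00:02,500
# Hello there.
# ...
# write_subtitle would write the same text to "<stem>.(<model>).srt" and return that path.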