Upload 4 files
- app.py +136 -0
- languages.py +146 -0
- requirements.txt +2 -0
- subtitle_manager.py +57 -0
app.py
ADDED
@@ -0,0 +1,136 @@
import time
import logging
import os
import gradio as gr
from faster_whisper import WhisperModel
from languages import get_language_names, get_language_from_name
from subtitle_manager import Subtitle
from pathlib import Path

logging.basicConfig(level=logging.INFO)
last_model = None
model = None

def transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                     chunk_length, compute_type, beam_size, vad_filter, min_silence_duration_ms,
                                     progress=gr.Progress()):
    global last_model
    global model

    progress(0, desc="Loading Audio..")
    logging.info(f"languageName: {languageName}")
    logging.info(f"urlData: {urlData}")
    logging.info(f"multipleFiles: {multipleFiles}")
    logging.info(f"microphoneData: {microphoneData}")
    logging.info(f"task: {task}")
    logging.info(f"chunk_length: {chunk_length}")

    # Load the model on the first call or when a different model is selected.
    if last_model is None or modelName != last_model:
        logging.info("first or new model")
        progress(0.1, desc="Loading Model..")
        model = None
        model = WhisperModel(modelName, device="cuda", compute_type=compute_type,
                             cpu_threads=os.cpu_count())  # device="auto", compute_type="float16"
        print('loaded')
    else:
        logging.info("Model not changed")
    last_model = modelName

    srt_sub = Subtitle("srt")
    # vtt_sub = Subtitle("vtt")
    # txt_sub = Subtitle("txt")

    files = []
    if multipleFiles:
        files += multipleFiles
    if urlData:
        files.append(urlData)
    if microphoneData:
        files.append(microphoneData)
    logging.info(files)

    languageName = None if languageName == "Automatic Detection" else get_language_from_name(languageName).code

    files_out = []
    vtt = ""
    txt = ""
    for file in progress.tqdm(files, desc="Working..."):

        start_time = time.time()
        segments, info = model.transcribe(
            file,
            beam_size=beam_size,
            vad_filter=vad_filter,
            language=languageName,
            vad_parameters=dict(min_silence_duration_ms=min_silence_duration_ms),
            # max_new_tokens=128,
            condition_on_previous_text=False,
            chunk_length=chunk_length,
        )

        file_name = Path(file).stem
        files_out_srt = srt_sub.write_subtitle(segments, file_name, modelName, progress)
        # txt = txt_sub.get_subtitle(segments, progress)
        logging.info(f"transcribe: {time.time() - start_time} sec.")
        files_out += [files_out_srt]

    return files_out, vtt, txt


with gr.Blocks(title="Fast Whisper WebUI") as demo:
    description = "faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is a fast inference engine for Transformer models."
    article = "Read the [documentation here](https://github.com/SYSTRAN/faster-whisper)."
    whisper_models = [
        "tiny", "tiny.en",
        "base", "base.en",
        "small", "small.en", "distil-small.en",
        "medium", "medium.en", "distil-medium.en",
        "large",
        "large-v1",
        "large-v2", "distil-large-v2",
        "large-v3",
    ]
    compute_types = [
        "auto", "default", "int8", "int8_float32",
        "int8_float16", "int8_bfloat16", "int16",
        "float16", "float32", "bfloat16"
    ]

    # settings
    # can't put Dropdown in inputs
    # with gr.Accordion("Settings", open=False):
    #     task = gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
    #     chunk_length = gr.Number(label='chunk_length', value=30, interactive=True),
    #     compute_type = gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
    #     beam_size = gr.Number(label='beam_size', value=5, interactive=True),
    #     vad_filter = gr.Checkbox(label='vad_filter', info='Use vad_filter', value=True),
    #     vad_min_silence_duration_ms = gr.Number(label='Vad min_silence_duration_ms', value=500, interactive=True),

    gr.Interface(
        fn=transcribe_webui_simple_progress,
        description=description,
        article=article,
        inputs=[
            gr.Dropdown(choices=whisper_models, value="distil-large-v2", label="Model", info="Select whisper model", interactive=True),
            gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", info="Select audio voice language", interactive=True),
            gr.Text(label="URL", info="(YouTube, etc.)", interactive=True),
            gr.File(label="Upload Files", file_count="multiple"),
            gr.Audio(sources=["microphone"], type="filepath", label="Microphone Input"),

            gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
            gr.Number(label='chunk_length', value=30, interactive=True),
            gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
            gr.Number(label='beam_size', value=5, interactive=True),
            gr.Checkbox(label='vad_filter', info='Use vad_filter', value=True),
            gr.Number(label='Vad min_silence_duration_ms', value=500, interactive=True),
        ],
        outputs=[
            gr.File(label="Download"),
            gr.Text(label="Transcription"),
            gr.Text(label="Segments"),
        ]
    )

if __name__ == "__main__":
    demo.launch()
languages.py
ADDED
@@ -0,0 +1,146 @@
class Language():
    def __init__(self, code, name):
        self.code = code
        self.name = name

    def __str__(self):
        return "Language(code={}, name={})".format(self.code, self.name)

LANGUAGES = [
    Language('en', 'English'),
    Language('zh', 'Chinese'),
    Language('de', 'German'),
    Language('es', 'Spanish'),
    Language('ru', 'Russian'),
    Language('ko', 'Korean'),
    Language('fr', 'French'),
    Language('ja', 'Japanese'),
    Language('pt', 'Portuguese'),
    Language('tr', 'Turkish'),
    Language('pl', 'Polish'),
    Language('ca', 'Catalan'),
    Language('nl', 'Dutch'),
    Language('ar', 'Arabic'),
    Language('sv', 'Swedish'),
    Language('it', 'Italian'),
    Language('id', 'Indonesian'),
    Language('hi', 'Hindi'),
    Language('fi', 'Finnish'),
    Language('vi', 'Vietnamese'),
    Language('he', 'Hebrew'),
    Language('uk', 'Ukrainian'),
    Language('el', 'Greek'),
    Language('ms', 'Malay'),
    Language('cs', 'Czech'),
    Language('ro', 'Romanian'),
    Language('da', 'Danish'),
    Language('hu', 'Hungarian'),
    Language('ta', 'Tamil'),
    Language('no', 'Norwegian'),
    Language('th', 'Thai'),
    Language('ur', 'Urdu'),
    Language('hr', 'Croatian'),
    Language('bg', 'Bulgarian'),
    Language('lt', 'Lithuanian'),
    Language('la', 'Latin'),
    Language('mi', 'Maori'),
    Language('ml', 'Malayalam'),
    Language('cy', 'Welsh'),
    Language('sk', 'Slovak'),
    Language('te', 'Telugu'),
    Language('fa', 'Persian'),
    Language('lv', 'Latvian'),
    Language('bn', 'Bengali'),
    Language('sr', 'Serbian'),
    Language('az', 'Azerbaijani'),
    Language('sl', 'Slovenian'),
    Language('kn', 'Kannada'),
    Language('et', 'Estonian'),
    Language('mk', 'Macedonian'),
    Language('br', 'Breton'),
    Language('eu', 'Basque'),
    Language('is', 'Icelandic'),
    Language('hy', 'Armenian'),
    Language('ne', 'Nepali'),
    Language('mn', 'Mongolian'),
    Language('bs', 'Bosnian'),
    Language('kk', 'Kazakh'),
    Language('sq', 'Albanian'),
    Language('sw', 'Swahili'),
    Language('gl', 'Galician'),
    Language('mr', 'Marathi'),
    Language('pa', 'Punjabi'),
    Language('si', 'Sinhala'),
    Language('km', 'Khmer'),
    Language('sn', 'Shona'),
    Language('yo', 'Yoruba'),
    Language('so', 'Somali'),
    Language('af', 'Afrikaans'),
    Language('oc', 'Occitan'),
    Language('ka', 'Georgian'),
    Language('be', 'Belarusian'),
    Language('tg', 'Tajik'),
    Language('sd', 'Sindhi'),
    Language('gu', 'Gujarati'),
    Language('am', 'Amharic'),
    Language('yi', 'Yiddish'),
    Language('lo', 'Lao'),
    Language('uz', 'Uzbek'),
    Language('fo', 'Faroese'),
    Language('ht', 'Haitian creole'),
    Language('ps', 'Pashto'),
    Language('tk', 'Turkmen'),
    Language('nn', 'Nynorsk'),
    Language('mt', 'Maltese'),
    Language('sa', 'Sanskrit'),
    Language('lb', 'Luxembourgish'),
    Language('my', 'Myanmar'),
    Language('bo', 'Tibetan'),
    Language('tl', 'Tagalog'),
    Language('mg', 'Malagasy'),
    Language('as', 'Assamese'),
    Language('tt', 'Tatar'),
    Language('haw', 'Hawaiian'),
    Language('ln', 'Lingala'),
    Language('ha', 'Hausa'),
    Language('ba', 'Bashkir'),
    Language('jw', 'Javanese'),
    Language('su', 'Sundanese')
]

_TO_LANGUAGE_CODE = {
    **{language.code: language for language in LANGUAGES},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}

_FROM_LANGUAGE_NAME = {
    **{language.name.lower(): language for language in LANGUAGES}
}

def get_language_from_code(language_code, default=None) -> Language:
    """Return the Language object for the given language code."""
    return _TO_LANGUAGE_CODE.get(language_code, default)

def get_language_from_name(language, default=None) -> Language:
    """Return the Language object for the given language name."""
    return _FROM_LANGUAGE_NAME.get(language.lower() if language else None, default)

def get_language_names():
    """Return a list of language names."""
    return [language.name for language in LANGUAGES]

if __name__ == "__main__":
    # Test lookup
    print(get_language_from_code('en'))
    print(get_language_from_name('English'))
    print(get_language_names())
requirements.txt
ADDED
@@ -0,0 +1,2 @@
gradio==4.16.0
faster-whisper @ git+https://github.com/guillaumekln/faster-whisper@master
subtitle_manager.py
ADDED
@@ -0,0 +1,57 @@
import logging

class Subtitle():
    def __init__(self, ext="srt"):
        sub_dict = {
            "srt": {
                "comma": ",",
                "header": "",
                "format": lambda i, start, end, text: f"{i + 1}\n{self.timeformat(start)} --> {self.timeformat(end if end is not None else start)}\n{text}\n\n",
            },
            "vtt": {
                "comma": ".",
                "header": "WEBVTT\n\n",
                "format": lambda i, start, end, text: f"{self.timeformat(start)} --> {self.timeformat(end if end is not None else start)}\n{text}\n\n",
            },
            "txt": {
                "comma": "",
                "header": "",
                "format": lambda i, start, end, text: f"{text}\n",
            },
        }

        self.ext = ext
        self.comma = sub_dict[ext]["comma"]
        self.header = sub_dict[ext]["header"]
        self.format = sub_dict[ext]["format"]

    def timeformat(self, time):
        hours = time // 3600
        minutes = (time - hours * 3600) // 60
        seconds = time - hours * 3600 - minutes * 60
        milliseconds = (time - int(time)) * 1000
        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}{self.comma}{int(milliseconds):03d}"

    def get_subtitle(self, segments, progress):
        output = self.header
        # for i, segment in enumerate(segments):
        for i, segment in enumerate(progress.tqdm(segments, desc="Whisper working...")):
            text = segment.text
            if text.startswith(' '):
                text = text[1:]
            try:
                result = self.format(i, segment.start, segment.end, text)
                output += result
                # logging.info(result)
            except Exception as e:
                logging.error(f"{e}: {segment}")
        return output

    def write_subtitle(self, segments, output_file, model, progress):
        # output_file = output_file.split('.')[0]
        output_file += f".({model})." + self.ext
        subtitle = self.get_subtitle(segments, progress)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(subtitle)
        return output_file