Spaces:
Running
Running
Merge branch 'main' of https://huggingface.co/spaces/aadnk/whisper-webui into main
Browse files- app.py +19 -23
- cli.py +3 -2
- src/languages.py +147 -0
- src/whisper/fasterWhisperContainer.py +6 -20
app.py
CHANGED
@@ -16,6 +16,7 @@ from src.config import ApplicationConfig
|
|
16 |
from src.hooks.progressListener import ProgressListener
|
17 |
from src.hooks.subTaskProgressListener import SubTaskProgressListener
|
18 |
from src.hooks.whisperProgressHook import create_progress_listener_handle
|
|
|
19 |
from src.modelCache import ModelCache
|
20 |
from src.source import get_audio_source_collection
|
21 |
from src.vadParallel import ParallelContext, ParallelTranscription
|
@@ -40,26 +41,6 @@ MAX_FILE_PREFIX_LENGTH = 17
|
|
40 |
# Limit auto_parallel to a certain number of CPUs (specify vad_cpu_cores to get a higher number)
|
41 |
MAX_AUTO_CPU_CORES = 8
|
42 |
|
43 |
-
LANGUAGES = [
|
44 |
-
"English", "Chinese", "German", "Spanish", "Russian", "Korean",
|
45 |
-
"French", "Japanese", "Portuguese", "Turkish", "Polish", "Catalan",
|
46 |
-
"Dutch", "Arabic", "Swedish", "Italian", "Indonesian", "Hindi",
|
47 |
-
"Finnish", "Vietnamese", "Hebrew", "Ukrainian", "Greek", "Malay",
|
48 |
-
"Czech", "Romanian", "Danish", "Hungarian", "Tamil", "Norwegian",
|
49 |
-
"Thai", "Urdu", "Croatian", "Bulgarian", "Lithuanian", "Latin",
|
50 |
-
"Maori", "Malayalam", "Welsh", "Slovak", "Telugu", "Persian",
|
51 |
-
"Latvian", "Bengali", "Serbian", "Azerbaijani", "Slovenian",
|
52 |
-
"Kannada", "Estonian", "Macedonian", "Breton", "Basque", "Icelandic",
|
53 |
-
"Armenian", "Nepali", "Mongolian", "Bosnian", "Kazakh", "Albanian",
|
54 |
-
"Swahili", "Galician", "Marathi", "Punjabi", "Sinhala", "Khmer",
|
55 |
-
"Shona", "Yoruba", "Somali", "Afrikaans", "Occitan", "Georgian",
|
56 |
-
"Belarusian", "Tajik", "Sindhi", "Gujarati", "Amharic", "Yiddish",
|
57 |
-
"Lao", "Uzbek", "Faroese", "Haitian Creole", "Pashto", "Turkmen",
|
58 |
-
"Nynorsk", "Maltese", "Sanskrit", "Luxembourgish", "Myanmar", "Tibetan",
|
59 |
-
"Tagalog", "Malagasy", "Assamese", "Tatar", "Hawaiian", "Lingala",
|
60 |
-
"Hausa", "Bashkir", "Javanese", "Sundanese"
|
61 |
-
]
|
62 |
-
|
63 |
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
|
64 |
|
65 |
class WhisperTranscriber:
|
@@ -418,22 +399,37 @@ def create_ui(app_config: ApplicationConfig):
|
|
418 |
ui.set_parallel_devices(app_config.vad_parallel_devices)
|
419 |
ui.set_auto_parallel(app_config.auto_parallel)
|
420 |
|
421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
|
423 |
ui_description += " as well as speech translation and language identification. "
|
424 |
|
425 |
ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
|
426 |
|
|
|
|
|
|
|
|
|
427 |
if app_config.input_audio_max_duration > 0:
|
428 |
ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
|
429 |
|
430 |
-
ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)"
|
431 |
|
432 |
whisper_models = app_config.get_model_names()
|
433 |
|
434 |
simple_inputs = lambda : [
|
435 |
gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
|
436 |
-
gr.Dropdown(choices=sorted(
|
437 |
gr.Text(label="URL (YouTube, etc.)"),
|
438 |
gr.File(label="Upload Files", file_count="multiple"),
|
439 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
|
|
16 |
from src.hooks.progressListener import ProgressListener
|
17 |
from src.hooks.subTaskProgressListener import SubTaskProgressListener
|
18 |
from src.hooks.whisperProgressHook import create_progress_listener_handle
|
19 |
+
from src.languages import get_language_names
|
20 |
from src.modelCache import ModelCache
|
21 |
from src.source import get_audio_source_collection
|
22 |
from src.vadParallel import ParallelContext, ParallelTranscription
|
|
|
41 |
# Limit auto_parallel to a certain number of CPUs (specify vad_cpu_cores to get a higher number)
|
42 |
MAX_AUTO_CPU_CORES = 8
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
|
45 |
|
46 |
class WhisperTranscriber:
|
|
|
399 |
ui.set_parallel_devices(app_config.vad_parallel_devices)
|
400 |
ui.set_auto_parallel(app_config.auto_parallel)
|
401 |
|
402 |
+
is_whisper = False
|
403 |
+
|
404 |
+
if app_config.whisper_implementation == "whisper":
|
405 |
+
implementation_name = "Whisper"
|
406 |
+
is_whisper = True
|
407 |
+
elif app_config.whisper_implementation in ["faster-whisper", "faster_whisper"]:
|
408 |
+
implementation_name = "Faster Whisper"
|
409 |
+
else:
|
410 |
+
# Try to convert from snake_case/kebab-case to title case (e.g. "faster-whisper" -> "Faster Whisper")
|
411 |
+
implementation_name = app_config.whisper_implementation.title().replace("_", " ").replace("-", " ")
|
412 |
+
|
413 |
+
ui_description = implementation_name + " is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
|
414 |
ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
|
415 |
ui_description += " as well as speech translation and language identification. "
|
416 |
|
417 |
ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
|
418 |
|
419 |
+
# Recommend faster-whisper
|
420 |
+
if is_whisper:
|
421 |
+
ui_description += "\n\n\n\nFor faster inference on GPU, try [faster-whisper](https://huggingface.co/spaces/aadnk/faster-whisper-webui)."
|
422 |
+
|
423 |
if app_config.input_audio_max_duration > 0:
|
424 |
ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
|
425 |
|
426 |
+
ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
|
427 |
|
428 |
whisper_models = app_config.get_model_names()
|
429 |
|
430 |
simple_inputs = lambda : [
|
431 |
gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
|
432 |
+
gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
|
433 |
gr.Text(label="URL (YouTube, etc.)"),
|
434 |
gr.File(label="Upload Files", file_count="multiple"),
|
435 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
cli.py
CHANGED
@@ -6,9 +6,10 @@ import warnings
|
|
6 |
import numpy as np
|
7 |
|
8 |
import torch
|
9 |
-
from app import
|
10 |
from src.config import ApplicationConfig
|
11 |
from src.download import download_url
|
|
|
12 |
|
13 |
from src.utils import optional_float, optional_int, str2bool
|
14 |
from src.whisper.whisperFactory import create_whisper_container
|
@@ -41,7 +42,7 @@ def cli():
|
|
41 |
|
42 |
parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
|
43 |
help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
|
44 |
-
parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(
|
45 |
help="language spoken in the audio, specify None to perform language detection")
|
46 |
|
47 |
parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
|
|
|
6 |
import numpy as np
|
7 |
|
8 |
import torch
|
9 |
+
from app import WhisperTranscriber
|
10 |
from src.config import ApplicationConfig
|
11 |
from src.download import download_url
|
12 |
+
from src.languages import get_language_names
|
13 |
|
14 |
from src.utils import optional_float, optional_int, str2bool
|
15 |
from src.whisper.whisperFactory import create_whisper_container
|
|
|
42 |
|
43 |
parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
|
44 |
help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
|
45 |
+
parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(get_language_names()), \
|
46 |
help="language spoken in the audio, specify None to perform language detection")
|
47 |
|
48 |
parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
|
src/languages.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Language:
    """A language entry pairing a short code with a display name.

    Attributes:
        code: The language code given to the constructor (e.g. 'en').
        name: The language name given to the constructor (e.g. 'English').
    """

    def __init__(self, code, name):
        """Store the given code and name unchanged."""
        self.code = code
        self.name = name

    def __str__(self):
        """Return a readable "Language(code=..., name=...)" description."""
        return "Language(code={}, name={})".format(self.code, self.name)

    def __repr__(self):
        """Return the same text as ``str()``.

        Without this, Language objects held in lists or dicts print as
        "<Language object at 0x...>", which makes the lookup tables
        unreadable when debugging.
        """
        return str(self)
|
8 |
+
|
9 |
+
# Every supported language as a (code, name) pair, in display-table order.
LANGUAGES = [
    Language('en', 'English'),
    Language('zh', 'Chinese'),
    Language('de', 'German'),
    Language('es', 'Spanish'),
    Language('ru', 'Russian'),
    Language('ko', 'Korean'),
    Language('fr', 'French'),
    Language('ja', 'Japanese'),
    Language('pt', 'Portuguese'),
    Language('tr', 'Turkish'),
    Language('pl', 'Polish'),
    Language('ca', 'Catalan'),
    Language('nl', 'Dutch'),
    Language('ar', 'Arabic'),
    Language('sv', 'Swedish'),
    Language('it', 'Italian'),
    Language('id', 'Indonesian'),
    Language('hi', 'Hindi'),
    Language('fi', 'Finnish'),
    Language('vi', 'Vietnamese'),
    Language('he', 'Hebrew'),
    Language('uk', 'Ukrainian'),
    Language('el', 'Greek'),
    Language('ms', 'Malay'),
    Language('cs', 'Czech'),
    Language('ro', 'Romanian'),
    Language('da', 'Danish'),
    Language('hu', 'Hungarian'),
    Language('ta', 'Tamil'),
    Language('no', 'Norwegian'),
    Language('th', 'Thai'),
    Language('ur', 'Urdu'),
    Language('hr', 'Croatian'),
    Language('bg', 'Bulgarian'),
    Language('lt', 'Lithuanian'),
    Language('la', 'Latin'),
    Language('mi', 'Maori'),
    Language('ml', 'Malayalam'),
    Language('cy', 'Welsh'),
    Language('sk', 'Slovak'),
    Language('te', 'Telugu'),
    Language('fa', 'Persian'),
    Language('lv', 'Latvian'),
    Language('bn', 'Bengali'),
    Language('sr', 'Serbian'),
    Language('az', 'Azerbaijani'),
    Language('sl', 'Slovenian'),
    Language('kn', 'Kannada'),
    Language('et', 'Estonian'),
    Language('mk', 'Macedonian'),
    Language('br', 'Breton'),
    Language('eu', 'Basque'),
    Language('is', 'Icelandic'),
    Language('hy', 'Armenian'),
    Language('ne', 'Nepali'),
    Language('mn', 'Mongolian'),
    Language('bs', 'Bosnian'),
    Language('kk', 'Kazakh'),
    Language('sq', 'Albanian'),
    Language('sw', 'Swahili'),
    Language('gl', 'Galician'),
    Language('mr', 'Marathi'),
    Language('pa', 'Punjabi'),
    Language('si', 'Sinhala'),
    Language('km', 'Khmer'),
    Language('sn', 'Shona'),
    Language('yo', 'Yoruba'),
    Language('so', 'Somali'),
    Language('af', 'Afrikaans'),
    Language('oc', 'Occitan'),
    Language('ka', 'Georgian'),
    Language('be', 'Belarusian'),
    Language('tg', 'Tajik'),
    Language('sd', 'Sindhi'),
    Language('gu', 'Gujarati'),
    Language('am', 'Amharic'),
    Language('yi', 'Yiddish'),
    Language('lo', 'Lao'),
    Language('uz', 'Uzbek'),
    Language('fo', 'Faroese'),
    # NOTE(review): lowercase 'c' in 'creole'. Name lookups below are
    # lower-cased, so this only affects the displayed/choice value - confirm
    # the casing is intended.
    Language('ht', 'Haitian creole'),
    Language('ps', 'Pashto'),
    Language('tk', 'Turkmen'),
    Language('nn', 'Nynorsk'),
    Language('mt', 'Maltese'),
    Language('sa', 'Sanskrit'),
    Language('lb', 'Luxembourgish'),
    Language('my', 'Myanmar'),
    Language('bo', 'Tibetan'),
    Language('tl', 'Tagalog'),
    Language('mg', 'Malagasy'),
    Language('as', 'Assamese'),
    Language('tt', 'Tatar'),
    Language('haw', 'Hawaiian'),
    Language('ln', 'Lingala'),
    Language('ha', 'Hausa'),
    Language('ba', 'Bashkir'),
    # NOTE(review): 'jw' rather than ISO 639-1 'jv' - presumably matches the
    # code the Whisper tokenizer expects; confirm before changing.
    Language('jw', 'Javanese'),
    Language('su', 'Sundanese'),
]

# Primary index: language code -> Language.
_LANGUAGE_BY_CODE = {language.code: language for language in LANGUAGES}

# Alternative spellings accepted as lookup keys, each mapped to the code of
# the language it stands for.
_LANGUAGE_ALIASES = {
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}

# Lookup table behind get_language_from_code(). Every value is a Language:
# aliases are resolved to the Language object of their target code instead of
# being stored as bare code strings, so callers always get the same type back.
_TO_LANGUAGE_CODE = {
    **_LANGUAGE_BY_CODE,
    **{alias: _LANGUAGE_BY_CODE[code] for alias, code in _LANGUAGE_ALIASES.items()},
}

# Lookup table behind get_language_from_name(): lower-cased name -> Language.
_FROM_LANGUAGE_NAME = {language.name.lower(): language for language in LANGUAGES}
|
129 |
+
|
130 |
+
def get_language_from_code(language_code, default=None) -> Language:
    """Look up ``language_code`` in the module's code table.

    Returns the entry stored under ``language_code`` in ``_TO_LANGUAGE_CODE``,
    or ``default`` when the table has no such key.
    """
    try:
        return _TO_LANGUAGE_CODE[language_code]
    except KeyError:
        return default
|
133 |
+
|
134 |
+
def get_language_from_name(language, default=None) -> Language:
    """Find the Language whose name equals ``language``, ignoring case.

    Returns ``default`` when ``language`` is falsy (None, "") or when no
    entry in ``_FROM_LANGUAGE_NAME`` matches the lower-cased name.
    """
    if not language:
        # The table only has lower-cased string keys, so a missing name can
        # never match; answer with the fallback straight away.
        return default

    return _FROM_LANGUAGE_NAME.get(language.lower(), default)
|
137 |
+
|
138 |
+
def get_language_names():
    """Collect the name of every entry in LANGUAGES, in table order."""
    names = []
    for entry in LANGUAGES:
        names.append(entry.name)
    return names
|
141 |
+
|
142 |
+
if __name__ == "__main__":
    # Manual smoke test: one lookup by code, one by name, then the name list.
    found_by_code = get_language_from_code('en')
    print(found_by_code)
    found_by_name = get_language_from_name('English')
    print(found_by_name)

    all_names = get_language_names()
    print(all_names)
|
src/whisper/fasterWhisperContainer.py
CHANGED
@@ -4,6 +4,7 @@ from typing import List, Union
|
|
4 |
from faster_whisper import WhisperModel, download_model
|
5 |
from src.config import ModelConfig
|
6 |
from src.hooks.progressListener import ProgressListener
|
|
|
7 |
from src.modelCache import ModelCache
|
8 |
from src.whisper.abstractWhisperContainer import AbstractWhisperCallback, AbstractWhisperContainer
|
9 |
|
@@ -179,24 +180,9 @@ class FasterWhisperCallback(AbstractWhisperCallback):
|
|
179 |
return [int(token) for token in suppress_tokens.split(",")]
|
180 |
|
181 |
def _lookup_language_code(self, language: str):
|
182 |
-
|
183 |
-
"english": "en", "chinese": "zh", "german": "de", "spanish": "es", "russian": "ru", "korean": "ko",
|
184 |
-
"french": "fr", "japanese": "ja", "portuguese": "pt", "turkish": "tr", "polish": "pl", "catalan": "ca",
|
185 |
-
"dutch": "nl", "arabic": "ar", "swedish": "sv", "italian": "it", "indonesian": "id", "hindi": "hi",
|
186 |
-
"finnish": "fi", "vietnamese": "vi", "hebrew": "he", "ukrainian": "uk", "greek": "el", "malay": "ms",
|
187 |
-
"czech": "cs", "romanian": "ro", "danish": "da", "hungarian": "hu", "tamil": "ta", "norwegian": "no",
|
188 |
-
"thai": "th", "urdu": "ur", "croatian": "hr", "bulgarian": "bg", "lithuanian": "lt", "latin": "la",
|
189 |
-
"maori": "mi", "malayalam": "ml", "welsh": "cy", "slovak": "sk", "telugu": "te", "persian": "fa",
|
190 |
-
"latvian": "lv", "bengali": "bn", "serbian": "sr", "azerbaijani": "az", "slovenian": "sl",
|
191 |
-
"kannada": "kn", "estonian": "et", "macedonian": "mk", "breton": "br", "basque": "eu", "icelandic": "is",
|
192 |
-
"armenian": "hy", "nepali": "ne", "mongolian": "mn", "bosnian": "bs", "kazakh": "kk", "albanian": "sq",
|
193 |
-
"swahili": "sw", "galician": "gl", "marathi": "mr", "punjabi": "pa", "sinhala": "si", "khmer": "km",
|
194 |
-
"shona": "sn", "yoruba": "yo", "somali": "so", "afrikaans": "af", "occitan": "oc", "georgian": "ka",
|
195 |
-
"belarusian": "be", "tajik": "tg", "sindhi": "sd", "gujarati": "gu", "amharic": "am", "yiddish": "yi",
|
196 |
-
"lao": "lo", "uzbek": "uz", "faroese": "fo", "haitian creole": "ht", "pashto": "ps", "turkmen": "tk",
|
197 |
-
"nynorsk": "nn", "maltese": "mt", "sanskrit": "sa", "luxembourgish": "lb", "myanmar": "my", "tibetan": "bo",
|
198 |
-
"tagalog": "tl", "malagasy": "mg", "assamese": "as", "tatar": "tt", "hawaiian": "haw", "lingala": "ln",
|
199 |
-
"hausa": "ha", "bashkir": "ba", "javanese": "jv", "sundanese": "su"
|
200 |
-
}
|
201 |
|
202 |
-
|
|
|
|
|
|
|
|
4 |
from faster_whisper import WhisperModel, download_model
|
5 |
from src.config import ModelConfig
|
6 |
from src.hooks.progressListener import ProgressListener
|
7 |
+
from src.languages import get_language_from_name
|
8 |
from src.modelCache import ModelCache
|
9 |
from src.whisper.abstractWhisperContainer import AbstractWhisperCallback, AbstractWhisperContainer
|
10 |
|
|
|
180 |
return [int(token) for token in suppress_tokens.split(",")]
|
181 |
|
182 |
def _lookup_language_code(self, language: str):
    """Translate a language name into its language code.

    Args:
        language: Language name to resolve via get_language_from_name().

    Returns:
        The ``code`` attribute of the matching language entry.

    Raises:
        ValueError: If get_language_from_name() finds no match for
            ``language``.
    """
    # Keep the lookup result separate from the argument: the original name is
    # still needed for the error message, and the result is None on a miss, so
    # concatenating it would raise TypeError instead of the intended ValueError.
    match = get_language_from_name(language)

    if match is None:
        raise ValueError("Invalid language: {}".format(language))

    return match.code
|