aadnk committed on
Commit
aa3be84
2 Parent(s): 4051119 751fc8f

Merge branch 'main' of https://huggingface.co/spaces/aadnk/whisper-webui into main

Browse files
Files changed (4) hide show
  1. app.py +19 -23
  2. cli.py +3 -2
  3. src/languages.py +147 -0
  4. src/whisper/fasterWhisperContainer.py +6 -20
app.py CHANGED
@@ -16,6 +16,7 @@ from src.config import ApplicationConfig
16
  from src.hooks.progressListener import ProgressListener
17
  from src.hooks.subTaskProgressListener import SubTaskProgressListener
18
  from src.hooks.whisperProgressHook import create_progress_listener_handle
 
19
  from src.modelCache import ModelCache
20
  from src.source import get_audio_source_collection
21
  from src.vadParallel import ParallelContext, ParallelTranscription
@@ -40,26 +41,6 @@ MAX_FILE_PREFIX_LENGTH = 17
40
  # Limit auto_parallel to a certain number of CPUs (specify vad_cpu_cores to get a higher number)
41
  MAX_AUTO_CPU_CORES = 8
42
 
43
- LANGUAGES = [
44
- "English", "Chinese", "German", "Spanish", "Russian", "Korean",
45
- "French", "Japanese", "Portuguese", "Turkish", "Polish", "Catalan",
46
- "Dutch", "Arabic", "Swedish", "Italian", "Indonesian", "Hindi",
47
- "Finnish", "Vietnamese", "Hebrew", "Ukrainian", "Greek", "Malay",
48
- "Czech", "Romanian", "Danish", "Hungarian", "Tamil", "Norwegian",
49
- "Thai", "Urdu", "Croatian", "Bulgarian", "Lithuanian", "Latin",
50
- "Maori", "Malayalam", "Welsh", "Slovak", "Telugu", "Persian",
51
- "Latvian", "Bengali", "Serbian", "Azerbaijani", "Slovenian",
52
- "Kannada", "Estonian", "Macedonian", "Breton", "Basque", "Icelandic",
53
- "Armenian", "Nepali", "Mongolian", "Bosnian", "Kazakh", "Albanian",
54
- "Swahili", "Galician", "Marathi", "Punjabi", "Sinhala", "Khmer",
55
- "Shona", "Yoruba", "Somali", "Afrikaans", "Occitan", "Georgian",
56
- "Belarusian", "Tajik", "Sindhi", "Gujarati", "Amharic", "Yiddish",
57
- "Lao", "Uzbek", "Faroese", "Haitian Creole", "Pashto", "Turkmen",
58
- "Nynorsk", "Maltese", "Sanskrit", "Luxembourgish", "Myanmar", "Tibetan",
59
- "Tagalog", "Malagasy", "Assamese", "Tatar", "Hawaiian", "Lingala",
60
- "Hausa", "Bashkir", "Javanese", "Sundanese"
61
- ]
62
-
63
  WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
64
 
65
  class WhisperTranscriber:
@@ -418,22 +399,37 @@ def create_ui(app_config: ApplicationConfig):
418
  ui.set_parallel_devices(app_config.vad_parallel_devices)
419
  ui.set_auto_parallel(app_config.auto_parallel)
420
 
421
- ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
 
 
 
 
 
 
 
 
 
 
 
422
  ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
423
  ui_description += " as well as speech translation and language identification. "
424
 
425
  ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
426
 
 
 
 
 
427
  if app_config.input_audio_max_duration > 0:
428
  ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
429
 
430
- ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)"
431
 
432
  whisper_models = app_config.get_model_names()
433
 
434
  simple_inputs = lambda : [
435
  gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
436
- gr.Dropdown(choices=sorted(LANGUAGES), label="Language", value=app_config.language),
437
  gr.Text(label="URL (YouTube, etc.)"),
438
  gr.File(label="Upload Files", file_count="multiple"),
439
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
 
16
  from src.hooks.progressListener import ProgressListener
17
  from src.hooks.subTaskProgressListener import SubTaskProgressListener
18
  from src.hooks.whisperProgressHook import create_progress_listener_handle
19
+ from src.languages import get_language_names
20
  from src.modelCache import ModelCache
21
  from src.source import get_audio_source_collection
22
  from src.vadParallel import ParallelContext, ParallelTranscription
 
41
  # Limit auto_parallel to a certain number of CPUs (specify vad_cpu_cores to get a higher number)
42
  MAX_AUTO_CPU_CORES = 8
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
45
 
46
  class WhisperTranscriber:
 
399
  ui.set_parallel_devices(app_config.vad_parallel_devices)
400
  ui.set_auto_parallel(app_config.auto_parallel)
401
 
402
+ is_whisper = False
403
+
404
+ if app_config.whisper_implementation == "whisper":
405
+ implementation_name = "Whisper"
406
+ is_whisper = True
407
+ elif app_config.whisper_implementation in ["faster-whisper", "faster_whisper"]:
408
+ implementation_name = "Faster Whisper"
409
+ else:
410
+ # Try to convert from camel-case to title-case
411
+ implementation_name = app_config.whisper_implementation.title().replace("_", " ").replace("-", " ")
412
+
413
+ ui_description = implementation_name + " is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
414
  ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
415
  ui_description += " as well as speech translation and language identification. "
416
 
417
  ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
418
 
419
+ # Recommend faster-whisper
420
+ if is_whisper:
421
+ ui_description += "\n\n\n\nFor faster inference on GPU, try [faster-whisper](https://huggingface.co/spaces/aadnk/faster-whisper-webui)."
422
+
423
  if app_config.input_audio_max_duration > 0:
424
  ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
425
 
426
+ ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
427
 
428
  whisper_models = app_config.get_model_names()
429
 
430
  simple_inputs = lambda : [
431
  gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
432
+ gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
433
  gr.Text(label="URL (YouTube, etc.)"),
434
  gr.File(label="Upload Files", file_count="multiple"),
435
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
cli.py CHANGED
@@ -6,9 +6,10 @@ import warnings
6
  import numpy as np
7
 
8
  import torch
9
- from app import LANGUAGES, WhisperTranscriber
10
  from src.config import ApplicationConfig
11
  from src.download import download_url
 
12
 
13
  from src.utils import optional_float, optional_int, str2bool
14
  from src.whisper.whisperFactory import create_whisper_container
@@ -41,7 +42,7 @@ def cli():
41
 
42
  parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
43
  help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
44
- parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(LANGUAGES), \
45
  help="language spoken in the audio, specify None to perform language detection")
46
 
47
  parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
 
6
  import numpy as np
7
 
8
  import torch
9
+ from app import WhisperTranscriber
10
  from src.config import ApplicationConfig
11
  from src.download import download_url
12
+ from src.languages import get_language_names
13
 
14
  from src.utils import optional_float, optional_int, str2bool
15
  from src.whisper.whisperFactory import create_whisper_container
 
42
 
43
  parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
44
  help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
45
+ parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(get_language_names()), \
46
  help="language spoken in the audio, specify None to perform language detection")
47
 
48
  parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
src/languages.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class Language:
    """A spoken language, identified by a short code and a display name.

    Attributes:
        code: Short language code, e.g. ``'en'`` or ``'haw'``.
        name: Human-readable display name, e.g. ``'English'``.
    """

    def __init__(self, code, name):
        self.code = code
        self.name = name

    def __str__(self):
        return "Language(code={}, name={})".format(self.code, self.name)

    def __repr__(self):
        # Same text as __str__, so containers of languages print readably.
        return self.__str__()


# Every language offered by this module, in display order.
LANGUAGES = [
    Language('en', 'English'),
    Language('zh', 'Chinese'),
    Language('de', 'German'),
    Language('es', 'Spanish'),
    Language('ru', 'Russian'),
    Language('ko', 'Korean'),
    Language('fr', 'French'),
    Language('ja', 'Japanese'),
    Language('pt', 'Portuguese'),
    Language('tr', 'Turkish'),
    Language('pl', 'Polish'),
    Language('ca', 'Catalan'),
    Language('nl', 'Dutch'),
    Language('ar', 'Arabic'),
    Language('sv', 'Swedish'),
    Language('it', 'Italian'),
    Language('id', 'Indonesian'),
    Language('hi', 'Hindi'),
    Language('fi', 'Finnish'),
    Language('vi', 'Vietnamese'),
    Language('he', 'Hebrew'),
    Language('uk', 'Ukrainian'),
    Language('el', 'Greek'),
    Language('ms', 'Malay'),
    Language('cs', 'Czech'),
    Language('ro', 'Romanian'),
    Language('da', 'Danish'),
    Language('hu', 'Hungarian'),
    Language('ta', 'Tamil'),
    Language('no', 'Norwegian'),
    Language('th', 'Thai'),
    Language('ur', 'Urdu'),
    Language('hr', 'Croatian'),
    Language('bg', 'Bulgarian'),
    Language('lt', 'Lithuanian'),
    Language('la', 'Latin'),
    Language('mi', 'Maori'),
    Language('ml', 'Malayalam'),
    Language('cy', 'Welsh'),
    Language('sk', 'Slovak'),
    Language('te', 'Telugu'),
    Language('fa', 'Persian'),
    Language('lv', 'Latvian'),
    Language('bn', 'Bengali'),
    Language('sr', 'Serbian'),
    Language('az', 'Azerbaijani'),
    Language('sl', 'Slovenian'),
    Language('kn', 'Kannada'),
    Language('et', 'Estonian'),
    Language('mk', 'Macedonian'),
    Language('br', 'Breton'),
    Language('eu', 'Basque'),
    Language('is', 'Icelandic'),
    Language('hy', 'Armenian'),
    Language('ne', 'Nepali'),
    Language('mn', 'Mongolian'),
    Language('bs', 'Bosnian'),
    Language('kk', 'Kazakh'),
    Language('sq', 'Albanian'),
    Language('sw', 'Swahili'),
    Language('gl', 'Galician'),
    Language('mr', 'Marathi'),
    Language('pa', 'Punjabi'),
    Language('si', 'Sinhala'),
    Language('km', 'Khmer'),
    Language('sn', 'Shona'),
    Language('yo', 'Yoruba'),
    Language('so', 'Somali'),
    Language('af', 'Afrikaans'),
    Language('oc', 'Occitan'),
    Language('ka', 'Georgian'),
    Language('be', 'Belarusian'),
    Language('tg', 'Tajik'),
    Language('sd', 'Sindhi'),
    Language('gu', 'Gujarati'),
    Language('am', 'Amharic'),
    Language('yi', 'Yiddish'),
    Language('lo', 'Lao'),
    Language('uz', 'Uzbek'),
    Language('fo', 'Faroese'),
    Language('ht', 'Haitian creole'),
    Language('ps', 'Pashto'),
    Language('tk', 'Turkmen'),
    Language('nn', 'Nynorsk'),
    Language('mt', 'Maltese'),
    Language('sa', 'Sanskrit'),
    Language('lb', 'Luxembourgish'),
    Language('my', 'Myanmar'),
    Language('bo', 'Tibetan'),
    Language('tl', 'Tagalog'),
    Language('mg', 'Malagasy'),
    Language('as', 'Assamese'),
    Language('tt', 'Tatar'),
    Language('haw', 'Hawaiian'),
    Language('ln', 'Lingala'),
    Language('ha', 'Hausa'),
    Language('ba', 'Bashkir'),
    Language('jw', 'Javanese'),
    Language('su', 'Sundanese'),
]

# Alternative (lower-case) language names mapped to the code of the
# canonical entry in LANGUAGES.
_LANGUAGE_ALIASES = {
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}

_LANGUAGE_BY_CODE = {language.code: language for language in LANGUAGES}

# Alias key -> Language object (never a bare code string), shared by both
# lookup tables below so every lookup yields the same value type.
_ALIAS_TO_LANGUAGE = {
    alias: _LANGUAGE_BY_CODE[code] for alias, code in _LANGUAGE_ALIASES.items()
}

# Code -> Language. The alias keys are kept for backward compatibility with
# the original table, but now resolve to Language objects like every other
# entry instead of plain strings.
_TO_LANGUAGE_CODE = {
    **_LANGUAGE_BY_CODE,
    **_ALIAS_TO_LANGUAGE,
}

# Lower-case name -> Language. Canonical names are merged last so they win
# over an alias should the two ever collide.
_FROM_LANGUAGE_NAME = {
    **_ALIAS_TO_LANGUAGE,
    **{language.name.lower(): language for language in LANGUAGES},
}


def get_language_from_code(language_code, default=None) -> Language:
    """Return the Language registered for ``language_code``.

    The lookup is case-sensitive. ``default`` is returned when the code is
    not known.
    """
    return _TO_LANGUAGE_CODE.get(language_code, default)


def get_language_from_name(language, default=None) -> Language:
    """Return the Language whose name (or alias) matches ``language``.

    The comparison ignores case. ``default`` is returned when ``language``
    is None, empty, or not a known name.
    """
    return _FROM_LANGUAGE_NAME.get(language.lower() if language else None, default)


def get_language_names():
    """Return the display names of all languages, in LANGUAGES order."""
    return [language.name for language in LANGUAGES]


if __name__ == "__main__":
    # Test lookup
    print(get_language_from_code('en'))
    print(get_language_from_name('English'))

    print(get_language_names())
src/whisper/fasterWhisperContainer.py CHANGED
@@ -4,6 +4,7 @@ from typing import List, Union
4
  from faster_whisper import WhisperModel, download_model
5
  from src.config import ModelConfig
6
  from src.hooks.progressListener import ProgressListener
 
7
  from src.modelCache import ModelCache
8
  from src.whisper.abstractWhisperContainer import AbstractWhisperCallback, AbstractWhisperContainer
9
 
@@ -179,24 +180,9 @@ class FasterWhisperCallback(AbstractWhisperCallback):
179
  return [int(token) for token in suppress_tokens.split(",")]
180
 
181
  def _lookup_language_code(self, language: str):
182
- lookup = {
183
- "english": "en", "chinese": "zh", "german": "de", "spanish": "es", "russian": "ru", "korean": "ko",
184
- "french": "fr", "japanese": "ja", "portuguese": "pt", "turkish": "tr", "polish": "pl", "catalan": "ca",
185
- "dutch": "nl", "arabic": "ar", "swedish": "sv", "italian": "it", "indonesian": "id", "hindi": "hi",
186
- "finnish": "fi", "vietnamese": "vi", "hebrew": "he", "ukrainian": "uk", "greek": "el", "malay": "ms",
187
- "czech": "cs", "romanian": "ro", "danish": "da", "hungarian": "hu", "tamil": "ta", "norwegian": "no",
188
- "thai": "th", "urdu": "ur", "croatian": "hr", "bulgarian": "bg", "lithuanian": "lt", "latin": "la",
189
- "maori": "mi", "malayalam": "ml", "welsh": "cy", "slovak": "sk", "telugu": "te", "persian": "fa",
190
- "latvian": "lv", "bengali": "bn", "serbian": "sr", "azerbaijani": "az", "slovenian": "sl",
191
- "kannada": "kn", "estonian": "et", "macedonian": "mk", "breton": "br", "basque": "eu", "icelandic": "is",
192
- "armenian": "hy", "nepali": "ne", "mongolian": "mn", "bosnian": "bs", "kazakh": "kk", "albanian": "sq",
193
- "swahili": "sw", "galician": "gl", "marathi": "mr", "punjabi": "pa", "sinhala": "si", "khmer": "km",
194
- "shona": "sn", "yoruba": "yo", "somali": "so", "afrikaans": "af", "occitan": "oc", "georgian": "ka",
195
- "belarusian": "be", "tajik": "tg", "sindhi": "sd", "gujarati": "gu", "amharic": "am", "yiddish": "yi",
196
- "lao": "lo", "uzbek": "uz", "faroese": "fo", "haitian creole": "ht", "pashto": "ps", "turkmen": "tk",
197
- "nynorsk": "nn", "maltese": "mt", "sanskrit": "sa", "luxembourgish": "lb", "myanmar": "my", "tibetan": "bo",
198
- "tagalog": "tl", "malagasy": "mg", "assamese": "as", "tatar": "tt", "hawaiian": "haw", "lingala": "ln",
199
- "hausa": "ha", "bashkir": "ba", "javanese": "jv", "sundanese": "su"
200
- }
201
 
202
- return lookup.get(language.lower() if language is not None else None, language)
 
 
 
 
4
  from faster_whisper import WhisperModel, download_model
5
  from src.config import ModelConfig
6
  from src.hooks.progressListener import ProgressListener
7
+ from src.languages import get_language_from_name
8
  from src.modelCache import ModelCache
9
  from src.whisper.abstractWhisperContainer import AbstractWhisperCallback, AbstractWhisperContainer
10
 
 
180
  return [int(token) for token in suppress_tokens.split(",")]
181
 
182
  def _lookup_language_code(self, language: str):
183
+ language = get_language_from_name(language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
+ if language is None:
186
+ raise ValueError("Invalid language: " + language)
187
+
188
+ return language.code