Matthijs Hollemans committed
Commit d2d20b7 · 1 Parent(s): 4ba2008

add language selector

Files changed (2)
  1. app.py +135 -5
  2. examples/johan_cruijff.mp3 +3 -0
app.py CHANGED
@@ -39,6 +39,126 @@ font = ImageFont.truetype("Lato-Regular.ttf", 40)
 text_color = (255, 200, 200)
 highlight_color = (255, 255, 255)
 
+
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+}
+
+
 if torch.cuda.is_available() and torch.cuda.device_count() > 0:
     from transformers import (
         AutomaticSpeechRecognitionPipeline,
@@ -129,7 +249,7 @@ def make_frame(t):
     return last_image
 
 
-def predict(audio_path):
+def predict(audio_path, language=None):
     global chunks, start_chunk, last_draws, last_image
 
     start_chunk = 0
@@ -141,6 +261,14 @@ def predict(audio_path):
     duration = min(max_duration, duration)
     audio_data = audio_data[:int(duration * sr)]
 
+    if language is not None:
+        pipe.model.config.forced_decoder_ids = (
+            pipe.tokenizer.get_decoder_prompt_ids(
+                language=language,
+                task="transcribe"
+            )
+        )
+
     # Run Whisper to get word-level timestamps.
     audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
     output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
@@ -185,16 +313,18 @@ article = """
 """
 
 examples = [
-    "examples/steve_jobs_crazy_ones.mp3",
-    "examples/henry5.wav",
-    "examples/stupid_people.mp3",
-    "examples/beos_song.mp3",
+    ["examples/steve_jobs_crazy_ones.mp3", "english"],
+    ["examples/henry5.wav", "english"],
+    ["examples/stupid_people.mp3", "english"],
+    ["examples/beos_song.mp3", "english"],
+    ["examples/johan_cruijff.mp3", "dutch"],
 ]
 
 gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(label="Upload Audio", source="upload", type="filepath"),
+        gr.Dropdown(label="Language", choices=sorted(list(TO_LANGUAGE_CODE.keys()))),
     ],
     outputs=[
         gr.Video(label="Output Video"),
examples/johan_cruijff.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c72e1b91bf3aa612422611b4e5c00154b19a0c3bc68c165a06fb9a3ae3f3bef
+size 96059
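
For context on the mechanism this commit relies on: the new language dropdown is passed into predict(), which turns the language name into forced decoder prompt IDs so Whisper transcribes in that language instead of auto-detecting it. Below is a minimal standalone sketch of that same mechanism, not part of this commit: the checkpoint name ("openai/whisper-base") and the use of WhisperProcessor are assumptions for illustration, whereas the app itself works through a transformers ASR pipeline ("pipe").

# Minimal sketch (assumed setup, not code from this repo) of forcing Whisper's
# transcription language via forced_decoder_ids, mirroring the new code path
# in predict().
import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-base")  # checkpoint is an assumption
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

# Same idea as pipe.tokenizer.get_decoder_prompt_ids(...) in the diff:
# pin the decoder to Dutch transcription rather than language auto-detection.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="dutch", task="transcribe"
)

# The example file added in this commit, resampled to Whisper's 16 kHz input rate.
audio, sr = librosa.load("examples/johan_cruijff.mp3", sr=16000)
inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
predicted_ids = model.generate(inputs.input_features)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

Passing language=None leaves forced_decoder_ids untouched, which is why the dropdown is optional and the previous auto-detect behaviour is preserved when no language is selected.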