Matthijs Hollemans committed
Commit d2d20b7 · 1 Parent(s): 4ba2008

add language selector

Files changed (2)
  1. app.py +135 -5
  2. examples/johan_cruijff.mp3 +3 -0
app.py CHANGED
@@ -39,6 +39,126 @@ font = ImageFont.truetype("Lato-Regular.ttf", 40)
 text_color = (255, 200, 200)
 highlight_color = (255, 255, 255)
 
+
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+}
+
+
 if torch.cuda.is_available() and torch.cuda.device_count() > 0:
     from transformers import (
         AutomaticSpeechRecognitionPipeline,
@@ -129,7 +249,7 @@ def make_frame(t):
     return last_image
 
 
-def predict(audio_path):
+def predict(audio_path, language=None):
     global chunks, start_chunk, last_draws, last_image
 
     start_chunk = 0
@@ -141,6 +261,14 @@ def predict(audio_path):
     duration = min(max_duration, duration)
     audio_data = audio_data[:int(duration * sr)]
 
+    if language is not None:
+        pipe.model.config.forced_decoder_ids = (
+            pipe.tokenizer.get_decoder_prompt_ids(
+                language=language,
+                task="transcribe"
+            )
+        )
+
     # Run Whisper to get word-level timestamps.
     audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
     output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
@@ -185,16 +313,18 @@ article = """
 """
 
 examples = [
-    "examples/steve_jobs_crazy_ones.mp3",
-    "examples/henry5.wav",
-    "examples/stupid_people.mp3",
-    "examples/beos_song.mp3",
+    ["examples/steve_jobs_crazy_ones.mp3", "english"],
+    ["examples/henry5.wav", "english"],
+    ["examples/stupid_people.mp3", "english"],
+    ["examples/beos_song.mp3", "english"],
+    ["examples/johan_cruijff.mp3", "dutch"],
 ]
 
 gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(label="Upload Audio", source="upload", type="filepath"),
+        gr.Dropdown(label="Language", choices=sorted(list(TO_LANGUAGE_CODE.keys()))),
     ],
     outputs=[
         gr.Video(label="Output Video"),
examples/johan_cruijff.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c72e1b91bf3aa612422611b4e5c00154b19a0c3bc68c165a06fb9a3ae3f3bef
+size 96059
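
For context on the mechanism this commit relies on: the new language dropdown is passed into predict(), which turns the language name into forced decoder prompt IDs so Whisper transcribes in that language instead of auto-detecting it. Below is a minimal standalone sketch of that same mechanism, not part of this commit: the checkpoint name ("openai/whisper-base") and the use of WhisperProcessor are assumptions for illustration, whereas the app itself works through a transformers ASR pipeline ("pipe").

# Minimal sketch (assumed setup, not code from this repo) of forcing Whisper's
# transcription language via forced_decoder_ids, mirroring the new code path
# in predict().
import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-base")  # checkpoint is an assumption
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

# Same idea as pipe.tokenizer.get_decoder_prompt_ids(...) in the diff:
# pin the decoder to Dutch transcription rather than language auto-detection.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="dutch", task="transcribe"
)

# The example file added in this commit, resampled to Whisper's 16 kHz input rate.
audio, sr = librosa.load("examples/johan_cruijff.mp3", sr=16000)
inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
predicted_ids = model.generate(inputs.input_features)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

Passing language=None leaves forced_decoder_ids untouched, which is why the dropdown is optional and the previous auto-detect behaviour is preserved when no language is selected.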