github-actions[bot] committed
Commit b399068 · Parent: 5d20e7d

Sync with https://github.com/mozilla-ai/speech-to-text-finetune

Files changed (1)
  1. app.py +74 -50
app.py CHANGED
@@ -1,22 +1,28 @@
 import os
+from pathlib import Path
+from typing import Tuple
 import gradio as gr
-import spaces
 from transformers import pipeline, Pipeline
+from huggingface_hub import repo_exists
+
+
+from speech_to_text_finetune.config import LANGUAGES_NAME_TO_ID
 
 is_hf_space = os.getenv("IS_HF_SPACE")
+languages = LANGUAGES_NAME_TO_ID.keys()
 model_ids = [
     "",
-    "mozilla-ai/whisper-small-gl (Galician)",
-    "mozilla-ai/whisper-small-el (Greek)",
-    "openai/whisper-tiny (Multilingual)",
-    "openai/whisper-small (Multilingual)",
-    "openai/whisper-medium (Multilingual)",
-    "openai/whisper-large-v3 (Multilingual)",
-    "openai/whisper-large-v3-turbo (Multilingual)",
+    "openai/whisper-tiny",
+    "openai/whisper-small",
+    "openai/whisper-medium",
+    "openai/whisper-large-v3",
+    "openai/whisper-large-v3-turbo",
 ]
 
 
-def _load_local_model(model_dir: str) -> Pipeline:
+def _load_local_model(model_dir: str, language: str) -> Tuple[Pipeline | None, str]:
+    if not Path(model_dir).is_dir():
+        return None, f"⚠️ Couldn't find local model directory: {model_dir}"
     from transformers import (
         WhisperProcessor,
         WhisperTokenizer,
@@ -25,53 +31,56 @@ def _load_local_model(model_dir: str) -> Pipeline:
     )
 
     processor = WhisperProcessor.from_pretrained(model_dir)
-    tokenizer = WhisperTokenizer.from_pretrained(model_dir, task="transcribe")
+    tokenizer = WhisperTokenizer.from_pretrained(
+        model_dir, language=language, task="transcribe"
+    )
     feature_extractor = WhisperFeatureExtractor.from_pretrained(model_dir)
     model = WhisperForConditionalGeneration.from_pretrained(model_dir)
 
-    try:
-        return pipeline(
-            task="automatic-speech-recognition",
-            model=model,
-            processor=processor,
-            tokenizer=tokenizer,
-            feature_extractor=feature_extractor,
-        )
-    except Exception as e:
-        return str(e)
+    return pipeline(
+        task="automatic-speech-recognition",
+        model=model,
+        processor=processor,
+        tokenizer=tokenizer,
+        feature_extractor=feature_extractor,
+    ), f"✅ Local model has been loaded from {model_dir}."
 
 
-def _load_hf_model(model_repo_id: str) -> Pipeline:
-    try:
-        return pipeline(
-            "automatic-speech-recognition",
-            model=model_repo_id,
+def _load_hf_model(model_repo_id: str, language: str) -> Tuple[Pipeline | None, str]:
+    if not repo_exists(model_repo_id):
+        return (
+            None,
+            f"⚠️ Couldn't find {model_repo_id} on Hugging Face. If it's a private repo, make sure you are logged in locally.",
         )
-    except Exception as e:
-        return str(e)
+    return pipeline(
+        "automatic-speech-recognition",
+        model=model_repo_id,
+        generate_kwargs={"language": language},
+    ), f"✅ HF Model {model_repo_id} has been loaded."
 
 
-@spaces.GPU(duration=30)
-def transcribe(
-    dropdown_model_id: str,
-    hf_model_id: str,
-    local_model_id: str,
-    audio: gr.Audio,
-) -> str:
+def load_model(
+    language: str, dropdown_model_id: str, hf_model_id: str, local_model_id: str
+) -> Tuple[Pipeline, str]:
     if dropdown_model_id and not hf_model_id and not local_model_id:
-        dropdown_model_id = dropdown_model_id.split(" (")[0]
-        pipe = _load_hf_model(dropdown_model_id)
+        yield None, f"Loading {dropdown_model_id}..."
+        yield _load_hf_model(dropdown_model_id, language)
     elif hf_model_id and not local_model_id and not dropdown_model_id:
-        pipe = _load_hf_model(hf_model_id)
+        yield None, f"Loading {hf_model_id}..."
+        yield _load_hf_model(hf_model_id, language)
     elif local_model_id and not hf_model_id and not dropdown_model_id:
-        pipe = _load_local_model(local_model_id)
+        yield None, f"Loading {local_model_id}..."
+        yield _load_local_model(local_model_id, language)
     else:
-        return (
-            "⚠️ Error: Please select or fill at least and only one of the options above"
+        yield (
+            None,
+            "⚠️ Please select or fill exactly one of the options above",
         )
-    if isinstance(pipe, str):
-        # Exception raised when loading
-        return f"⚠️ Error: {pipe}"
+    if not language:
+        yield None, "⚠️ Please select a language from the dropdown"
+
+
+def transcribe(pipe: Pipeline, audio: gr.Audio) -> str:
     text = pipe(audio)["text"]
     return text
 
@@ -80,12 +89,18 @@ def setup_gradio_demo():
     with gr.Blocks() as demo:
         gr.Markdown(
            """ # 🗣️ Speech-to-Text Transcription
-            ### 1. Select which model to use from one of the options below.
-            ### 2. Record a message or upload an audio file.
-            ### 3. Click Transcribe to see the transcription generated by the model.
+            ### 1. Select a language from the dropdown menu.
+            ### 2. Select which model to load from one of the options below.
+            ### 3. Load the model by clicking the Load model button.
+            ### 4. Record a message or upload an audio file.
+            ### 5. Click Transcribe to see the transcription generated by the model.
             """
         )
-        ### Model selection ###
+        ### Language & Model selection ###
+
+        selected_lang = gr.Dropdown(
+            choices=list(languages), value=None, label="Select a language"
+        )
 
         with gr.Row():
             with gr.Column():
@@ -103,6 +118,9 @@ def setup_gradio_demo():
                     placeholder="artifacts/my-whisper-tiny",
                 )
 
+        load_model_button = gr.Button("Load model")
+        model_loaded = gr.Markdown()
+
         ### Transcription ###
         audio_input = gr.Audio(
             sources=["microphone", "upload"],
@@ -114,10 +132,16 @@ def setup_gradio_demo():
         transcribe_button = gr.Button("Transcribe")
         transcribe_output = gr.Text(label="Output")
 
+        ### Event listeners ###
+        model = gr.State()
+        load_model_button.click(
+            fn=load_model,
+            inputs=[selected_lang, dropdown_model, user_model, local_model],
+            outputs=[model, model_loaded],
+        )
+
         transcribe_button.click(
-            fn=transcribe,
-            inputs=[dropdown_model, user_model, local_model, audio_input],
-            outputs=transcribe_output,
+            fn=transcribe, inputs=[model, audio_input], outputs=transcribe_output
         )
 
     demo.launch()
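
Note on the pattern this commit introduces: the new load_model is a generator, so each yield pushes an intermediate update to the two outputs wired in load_model_button.click, the gr.State slot holding the pipeline and the model_loaded Markdown status. The sketch below is a minimal, self-contained illustration of that Gradio idiom, not code from this commit; the names demo_load/demo_use and the time.sleep stand-in for the expensive pipeline(...) call are illustrative assumptions.

    # Minimal sketch of the generator-handler + gr.State pattern used above.
    # Not from the commit: demo_load/demo_use and the sleep are placeholders.
    import time

    import gradio as gr


    def demo_load(name: str):
        # First yield: no model yet, just a status message for the Markdown.
        yield None, f"Loading {name}..."
        time.sleep(1)  # stand-in for the real, slow model-loading call
        # Final yield: store the loaded object in gr.State and report success.
        yield {"name": name}, f"✅ {name} has been loaded."


    def demo_use(model: dict | None) -> str:
        # gr.State passes the stored object back in; it is None before loading.
        if model is None:
            return "⚠️ Load a model first"
        return f"Transcribing with {model['name']}"


    with gr.Blocks() as demo:
        name = gr.Textbox(label="Model name")
        load_btn = gr.Button("Load model")
        status = gr.Markdown()
        model = gr.State()  # persists across events, one value per session
        run_btn = gr.Button("Run")
        out = gr.Text(label="Output")

        load_btn.click(fn=demo_load, inputs=name, outputs=[model, status])
        run_btn.click(fn=demo_use, inputs=model, outputs=out)

    demo.launch()

As in the committed app.py, the state starts out empty, so triggering the second handler before loading passes None; the sketch guards against that explicitly, whereas the commit's transcribe would fail on pipe(audio).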