Lakoc committed
Commit 8c54c03
Parent: 2362603

Initial commit

Files changed (4):
  1. .idea/.gitignore +8 -0
  2. app.py +50 -34
  3. requirements.txt +1 -1
  4. whisper_notebook.ipynb +0 -192
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
app.py CHANGED
@@ -8,29 +8,47 @@ from transformers.pipelines.audio_utils import ffmpeg_read

 import tempfile
 import os
+import time
+
+# Available models to choose from
+MODEL_OPTIONS = ["BUT-FIT/DeCRED-base", "BUT-FIT/DeCRED-small", "BUT-FIT/ED-base", "BUT-FIT/ED-small"]
+DEFAULT_MODEL = MODEL_OPTIONS[0]

-MODEL_NAME = "openai/whisper-large-v3-turbo"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

 device = 0 if torch.cuda.is_available() else "cpu"

-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    device=device,
-)
+
+# Function to initialize pipeline based on model selection
+def initialize_pipeline(model_name):
+    pipe = pipeline(
+        task="automatic-speech-recognition",
+        model=model_name,
+        feature_extractor=model_name,
+        chunk_length_s=30,
+        device=device,
+        trust_remote_code=True
+    )
+    pipe.type = "seq2seq"
+    return pipe
+
+# Initialize the pipeline with a default model (it will be updated after user selects one)
+pipe = initialize_pipeline(DEFAULT_MODEL)
+pipe.type = "seq2seq"


 @spaces.GPU
-def transcribe(inputs, task):
+def transcribe(inputs, selected_model):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return text
+    # Update the pipeline with the selected model
+    pipe = initialize_pipeline(selected_model)
+
+    text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
+    return text


 def _return_yt_html_embed(yt_url):
@@ -41,41 +59,46 @@ def _return_yt_html_embed(yt_url):
     )
     return HTML_str

+
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
-
+
     try:
         info = info_loader.extract_info(yt_url, download=False)
     except youtube_dl.utils.DownloadError as err:
         raise gr.Error(str(err))
-
+
     file_length = info["duration_string"]
     file_h_m_s = file_length.split(":")
     file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
-
+
     if len(file_h_m_s) == 1:
         file_h_m_s.insert(0, 0)
     if len(file_h_m_s) == 2:
         file_h_m_s.insert(0, 0)
     file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
-
+
     if file_length_s > YT_LENGTH_LIMIT_S:
         yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
         file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
         raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
-
+
     ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
-
+
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         try:
             ydl.download([yt_url])
         except youtube_dl.utils.ExtractorError as err:
             raise gr.Error(str(err))

+
 @spaces.GPU
-def yt_transcribe(yt_url, task, max_filesize=75.0):
+def yt_transcribe(yt_url, selected_model, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)

+    # Update the pipeline with the selected model
+    pipe = initialize_pipeline(selected_model)
+
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
         download_yt_audio(yt_url, filepath)
@@ -85,7 +108,7 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
         inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
         inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

-        text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
+        text = pipe(inputs, batch_size=BATCH_SIZE)["text"]

     return html_embed_str, text

@@ -96,14 +119,12 @@ mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(sources="microphone", type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Dropdown(choices=MODEL_OPTIONS, label="Model", value=DEFAULT_MODEL)
     ],
     outputs="text",
-    title="Whisper Large V3 Turbo: Transcribe Audio",
+    title="Transcribe Audio",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
+        "Transcribe long-form microphone or audio inputs with the click of a button! Select a model from the dropdown."
     ),
     allow_flagging="never",
 )
@@ -112,14 +133,12 @@ file_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Dropdown(choices=MODEL_OPTIONS, label="Model", value=DEFAULT_MODEL)
     ],
     outputs="text",
-    title="Whisper Large V3: Transcribe Audio",
+    title="Transcribe Audio",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
+        "Transcribe audio files with the click of a button! Select a model from the dropdown."
     ),
     allow_flagging="never",
 )
@@ -128,14 +147,12 @@ yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[
         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+        gr.Dropdown(choices=MODEL_OPTIONS, label="Model", value=DEFAULT_MODEL)
     ],
     outputs=["html", "text"],
-    title="Whisper Large V3: Transcribe YouTube",
+    title="Transcribe YouTube",
     description=(
-        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
-        " arbitrary length."
+        "Transcribe long-form YouTube videos with the click of a button! Select a model from the dropdown."
     ),
     allow_flagging="never",
 )
@@ -144,4 +161,3 @@ with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

 demo.queue().launch(ssr_mode=False)
-
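The model-selection flow introduced above can be exercised outside Gradio with a short script. This is a minimal sketch, not part of the commit: it assumes the BUT-FIT checkpoints load through the transformers trust_remote_code path and that a local audio file named sample.wav exists (the file name is hypothetical).

import torch
from transformers import pipeline

# Same choices the app exposes in its dropdowns.
MODEL_OPTIONS = ["BUT-FIT/DeCRED-base", "BUT-FIT/DeCRED-small", "BUT-FIT/ED-base", "BUT-FIT/ED-small"]
device = 0 if torch.cuda.is_available() else "cpu"

def initialize_pipeline(model_name):
    # Mirrors the app's helper: model and feature extractor come from the same
    # repo, and remote code is trusted because these are custom architectures.
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        feature_extractor=model_name,
        chunk_length_s=30,
        device=device,
        trust_remote_code=True,
    )
    pipe.type = "seq2seq"  # force the seq2seq decoding path, as the app does
    return pipe

pipe = initialize_pipeline(MODEL_OPTIONS[0])
print(pipe("sample.wav", batch_size=8)["text"])

One consequence of the design is worth noting: transcribe and yt_transcribe rebuild the pipeline on every request, so the dropdown choice always takes effect, at the cost of reloading the selected model each time; the module-level pipe built from DEFAULT_MODEL is shadowed by the local assignment inside each handler.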
 
requirements.txt CHANGED
@@ -1,2 +1,2 @@
-transformers
+transformers==4.39.3
 yt-dlp
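Because app.py now loads custom model code with trust_remote_code=True, the exact transformers version can change behavior, which is presumably why the dependency is pinned. A startup guard along these lines (a sketch, not part of this commit) would fail fast on a drifted environment:

from importlib.metadata import version

# Abort early if the installed transformers does not match the pinned requirement.
installed = version("transformers")
assert installed == "4.39.3", f"expected transformers==4.39.3, found {installed}"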
whisper_notebook.ipynb DELETED
@@ -1,192 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "T4"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
-  },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Whisper v3 is here!\n",
-        "\n",
-        "Whisper v3 is a new model open sourced by OpenAI. The model can do multilingual transcriptions and is quite impressive. For example, you can change from English to Spanish or Chinese in the middle of a sentence and it will work well!\n",
-        "\n",
-        "The model can be run in a free Google Colab instance and is integrated into `transformers` already, so switching can be a very smooth process if you already use the previous versions."
-      ],
-      "metadata": {
-        "id": "OXaUqiE-eyXM"
-      }
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "WFQeUT9EcIcK"
-      },
-      "outputs": [],
-      "source": [
-        "%%capture\n",
-        "!pip install git+https://github.com/huggingface/transformers gradio"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Let's use the high level `pipeline` from the `transformers` library to load the model."
-      ],
-      "metadata": {
-        "id": "sZONes21fHTA"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import torch\n",
-        "from transformers import pipeline\n",
-        "\n",
-        "pipe = pipeline(\"automatic-speech-recognition\",\n",
-        "                \"openai/whisper-large-v3\",\n",
-        "                torch_dtype=torch.float16,\n",
-        "                device=\"cuda:0\")"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "DvBdwMdPcr-Y",
-        "outputId": "47f32218-fd85-49ea-d880-d31577bcf9b8"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
-            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "pipe(\"https://cdn-media.huggingface.co/speech_samples/sample1.flac\")"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "GZFkIyhjc0Nc",
-        "outputId": "f1463431-3e08-4438-815f-b71e5e7a1503"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "{'text': \" going along slushy country roads and speaking to damp audiences in draughty schoolrooms day after day for a fortnight he'll have to put in an appearance at some place of worship on sunday morning and he can come to us immediately afterwards\"}"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 2
-        }
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Let's now build a quick Gradio demo where we can play with the model directly using our microphone! You can run this code in a Google Colab instance (or locally!) or just head to the <a href=\"https://huggingface.co/spaces/hf-audio/whisper-large-v3\" target=\"_blank\">Space</a> to play directly with it online."
-      ],
-      "metadata": {
-        "id": "pt3YtM_PfTQY"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import gradio as gr\n",
-        "\n",
-        "def transcribe(inputs):\n",
-        "    if inputs is None:\n",
-        "        raise gr.Error(\"No audio file submitted! Please record an audio before submitting your request.\")\n",
-        "\n",
-        "    text = pipe(inputs, generate_kwargs={\"task\": \"transcribe\"}, return_timestamps=True)[\"text\"]\n",
-        "    return text\n",
-        "\n",
-        "demo = gr.Interface(\n",
-        "    fn=transcribe,\n",
-        "    inputs=[\n",
-        "        gr.Audio(sources=[\"microphone\", \"upload\"], type=\"filepath\"),\n",
-        "    ],\n",
-        "    outputs=\"text\",\n",
-        "    title=\"Whisper Large V3: Transcribe Audio\",\n",
-        "    description=(\n",
-        "        \"Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the\"\n",
-        "        \" checkpoint [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) and 🤗 Transformers to transcribe audio files\"\n",
-        "        \" of arbitrary length.\"\n",
-        "    ),\n",
-        "    allow_flagging=\"never\",\n",
-        ")\n",
-        "\n",
-        "demo.launch()\n"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 648
-        },
-        "id": "K0b2UZLVdIze",
-        "outputId": "bcff00e0-4fc8-4883-9ba4-480f5a6665f0"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n",
-            "\n",
-            "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n",
-            "Running on public URL: https://037dbdb04542aa1a29.gradio.live\n",
-            "\n",
-            "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
-          ]
-        },
-        {
-          "output_type": "display_data",
-          "data": {
-            "text/plain": [
-              "<IPython.core.display.HTML object>"
-            ],
-            "text/html": [
-              "<div><iframe src=\"https://037dbdb04542aa1a29.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-            ]
-          },
-          "metadata": {}
-        },
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": []
-          },
-          "metadata": {},
-          "execution_count": 4
-        }
-      ]
-    }
-  ]
-}