ovieyra21 committed
Commit 104123a • 1 Parent(s): 265bd42

Update app.py

Files changed (1)
  1. app.py +177 -54
app.py CHANGED
@@ -1,27 +1,20 @@
- import os
  import torch
  import yt_dlp as youtube_dl
  import numpy as np
  from datasets import Dataset, Audio
  from scipy.io import wavfile
  from transformers import pipeline
  import tempfile
  import time
- import gradio as gr
-
- from huggingface_hub import login, HfApi
- # Ensure you have logged in to Hugging Face Hub
- HF_API_TOKEN = os.getenv('OAUTH_CLIENT_SECRET')
 
- css = """
- #intro{
-     max-width: 100%;
-     text-align: center;
-     margin: 0 auto;
- }
- """
-
- MODEL_NAME = "openai/whisper-large-v3"
  DEMUCS_MODEL_NAME = "htdemucs_ft"
  BATCH_SIZE = 8
  FILE_LIMIT_MB = 1000
@@ -36,20 +29,78 @@ pipe = pipeline(
      device=device,
  )
 
- separator = demucs.api.Separator(model=DEMUCS_MODEL_NAME)
 
  def separate_vocal(path):
      origin, separated = separator.separate_audio_file(path)
      demucs.api.save_audio(separated["vocals"], path, samplerate=separator.samplerate)
      return path
 
  def _return_yt_html_embed(yt_url):
      video_id = yt_url.split("?v=")[-1]
      HTML_str = (
          f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
          " </center>"
      )
-     return gr.HTML(value=HTML_str)
 
  def download_yt_audio(yt_url, filename):
      info_loader = youtube_dl.YoutubeDL()
@@ -70,8 +121,8 @@ def download_yt_audio(yt_url, filename):
      file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
 
      if file_length_s > YT_LENGTH_LIMIT_S:
-         yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
-         file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
          raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got a {file_length_hms} YouTube video.")
 
      ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
@@ -82,31 +133,48 @@ def download_yt_audio(yt_url, filename):
      except youtube_dl.utils.ExtractorError as err:
          raise gr.Error(str(err))
 
- def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token, progress=gr.Progress()):
-     if inputs_path is None:
-         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-     if dataset_name is None:
-         raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
 
-     if oauth_token is None:
-         gr.Warning("Make sure to click and login before using this demo.")
-         return ["transcripts will appear here"]
 
-     total_step = 4
      current_step = 0
 
      current_step += 1
-     progress((current_step, total_step), desc="Transcribe using Whisper.")
 
-     sampling_rate, inputs = wavfile.read(inputs_path)
 
-     out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
-     text = out["text"]
 
      current_step += 1
      progress((current_step, total_step), desc="Merge chunks.")
-     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, sampling_rate)
 
      current_step += 1
      progress((current_step, total_step), desc="Create dataset.")
@@ -114,12 +182,16 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token, progress=gr.Progress()):
      transcripts = []
      audios = []
      with tempfile.TemporaryDirectory() as tmpdirname:
-         for i, chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for)")):
              arr = chunk["audio"]
              path = os.path.join(tmpdirname, f"{i}.wav")
-             wavfile.write(path, sampling_rate, arr)
 
              if use_demucs == "separate-audio":
                  path = separate_vocal(path)
 
              audios.append(path)
@@ -131,26 +203,75 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token, progress=gr.Progress()):
      progress((current_step, total_step), desc="Push dataset.")
      dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
 
-     return [[transcript] for transcript in transcripts], text
 
  with gr.Blocks(css=css) as demo:
      with gr.Row():
          gr.LoginButton()
          gr.LogoutButton()
 
      with gr.Tab("YouTube"):
-         gr.Markdown("Create your own TTS dataset using YouTube", elem_id="intro")
-         gr.Markdown(
-             "This demo allows you to create a text-to-speech dataset from an input audio snippet and push it to the hub to keep track of it."
-             f" Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-             " of arbitrary length. It then merges chunks of audio and pushes them to the hub."
-         )
          with gr.Row():
              with gr.Column():
                  audio_youtube = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
                  task_youtube = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
-                 cleaning_youtube = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not clean (background noise and music))", value="separate-audio")
-                 textbox_youtube = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
 
          with gr.Row():
              clear_youtube = gr.ClearButton([audio_youtube, task_youtube, cleaning_youtube, textbox_youtube])
@@ -158,22 +279,22 @@ with gr.Blocks(css=css) as demo:
 
          with gr.Column():
              html_youtube = gr.HTML()
-             dataset_youtube = gr.Dataset(label="Transcribed samples.", components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
              transcript_youtube = gr.Textbox(label="Transcription")
 
      with gr.Tab("Microphone or Audio file"):
          gr.Markdown("Create your own TTS dataset using your own recordings", elem_id="intro")
-         gr.Markdown(
-             "This demo allows you to create a text-to-speech dataset from an input audio snippet and push it to the hub to keep track of it."
-             f" Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-             " of arbitrary length. It then merges chunks of audio and pushes them to the hub."
-         )
          with gr.Row():
              with gr.Column():
                  audio_file = gr.Audio(type="filepath")
                  task_file = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
-                 cleaning_file = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not clean (background noise and music))", value="separate-audio")
-                 textbox_file = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
 
          with gr.Row():
              clear_file = gr.ClearButton([audio_file, task_file, cleaning_file, textbox_file])
@@ -183,7 +304,9 @@ with gr.Blocks(css=css) as demo:
              dataset_file = gr.Dataset(label="Transcribed samples.", components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
              transcript_file = gr.Textbox(label="Transcription")
 
      submit_file.click(transcribe, inputs=[audio_file, task_file, cleaning_file, textbox_file], outputs=[dataset_file, transcript_file])
-     submit_youtube.click(transcribe, inputs=[audio_youtube, task_youtube, cleaning_youtube, textbox_youtube], outputs=[dataset_youtube, transcript_youtube])
 
- demo.launch(debug=True)
 
 
  import torch
+
+ import gradio as gr
  import yt_dlp as youtube_dl
  import numpy as np
  from datasets import Dataset, Audio
  from scipy.io import wavfile
+
  from transformers import pipeline
+ from transformers.pipelines.audio_utils import ffmpeg_read
+
  import tempfile
+ import os
  import time
+ import demucs.api
 
+ MODEL_NAME = "openai/whisper-large-v3"  # "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
  DEMUCS_MODEL_NAME = "htdemucs_ft"
  BATCH_SIZE = 8
  FILE_LIMIT_MB = 1000
 
      device=device,
  )
 
+ separator = demucs.api.Separator(model=DEMUCS_MODEL_NAME)
 
  def separate_vocal(path):
      origin, separated = separator.separate_audio_file(path)
      demucs.api.save_audio(separated["vocals"], path, samplerate=separator.samplerate)
      return path
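
+ # separate_vocal overwrites the file at `path` in place with the isolated
+ # "vocals" stem; htdemucs_ft is a fine-tuned Hybrid Transformer Demucs, so the
+ # remaining stems (drums, bass, other) are simply discarded here.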
 
+
+ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
+     if inputs_path is None:
+         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+     if dataset_name is None:
+         raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
+
+     if oauth_token is None:
+         gr.Warning("Make sure to click and login before using this demo.")
+         return [["transcripts will appear here"]], ""
+
+     total_step = 4
+     current_step = 0
+
+     current_step += 1
+     progress((current_step, total_step), desc="Transcribe using Whisper.")
+
+     sampling_rate, inputs = wavfile.read(inputs_path)
+
+     out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
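+     # return_timestamps=True makes the pipeline return out["chunks"], a list of
+     # {"timestamp": (start_s, end_s), "text": ...} segments next to the full text;
+     # generate_kwargs={"task": task} toggles Whisper between transcribe and translate.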
+
+     text = out["text"]
+
+     current_step += 1
+     progress((current_step, total_step), desc="Merge chunks.")
+     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, sampling_rate)
+
+     current_step += 1
+     progress((current_step, total_step), desc="Create dataset.")
+
+
+     transcripts = []
+     audios = []
+     with tempfile.TemporaryDirectory() as tmpdirname:
+         for i, chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for)")):
+
+             # TODO: make sure 1D or 2D?
+             arr = chunk["audio"]
+             path = os.path.join(tmpdirname, f"{i}.wav")
+             wavfile.write(path, sampling_rate, arr)
+
+             if use_demucs == "separate-audio":
+                 # use demucs to separate vocals
+                 print(f"Separating vocals #{i}")
+                 path = separate_vocal(path)
+
+             audios.append(path)
+             transcripts.append(chunk["text"])
+
+     dataset = Dataset.from_dict({"audio": audios, "text": transcripts}).cast_column("audio", Audio())
+
+     current_step += 1
+     progress((current_step, total_step), desc="Push dataset.")
+     dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
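+     # push_to_hub creates or updates the dataset repo under dataset_name, using
+     # the OAuth access token obtained from the gr.LoginButton sign-in.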
+
+     return [[transcript] for transcript in transcripts], text
+
+
  def _return_yt_html_embed(yt_url):
      video_id = yt_url.split("?v=")[-1]
      HTML_str = (
          f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
          " </center>"
      )
+     return HTML_str
 
  def download_yt_audio(yt_url, filename):
      info_loader = youtube_dl.YoutubeDL()
 
      file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
 
      if file_length_s > YT_LENGTH_LIMIT_S:
+         yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
+         file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
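+         # "%HH:%MM:%SS" renders e.g. 3600 s as "01H:00M:00S" (the trailing letters
+         # are strftime literals); use "%H:%M:%S" for plain "01:00:00" output.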
          raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got a {file_length_hms} YouTube video.")
 
      ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
 
      except youtube_dl.utils.ExtractorError as err:
          raise gr.Error(str(err))
 
+ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate=24000,
+                   progress=gr.Progress()):
+
+     if yt_url is None:
+         raise gr.Error("No YouTube link submitted! Please submit a working link.")
+     if dataset_name is None:
+         raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
+
+     total_step = 5
      current_step = 0
 
+     html_embed_str = _return_yt_html_embed(yt_url)
+
+     if oauth_token is None:
+         gr.Warning("Make sure to click and login before using this demo.")
+         return html_embed_str, [["transcripts will appear here"]], ""
+
      current_step += 1
+     progress((current_step, total_step), desc="Load video.")
 
+     with tempfile.TemporaryDirectory() as tmpdirname:
+         filepath = os.path.join(tmpdirname, "video.mp4")
+
+         download_yt_audio(yt_url, filepath)
+         with open(filepath, "rb") as f:
+             inputs_path = f.read()
+
+     inputs = ffmpeg_read(inputs_path, pipe.feature_extractor.sampling_rate)
+     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
+     current_step += 1
+     progress((current_step, total_step), desc="Transcribe using Whisper.")
+     out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
+
+     text = out["text"]
+
+     inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)
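+     # decode the audio a second time at dataset_sampling_rate (24 kHz by default),
+     # so the pushed dataset is stored at a TTS-friendly rate rather than Whisper's
+     # 16 kHz model input rate.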
 
      current_step += 1
      progress((current_step, total_step), desc="Merge chunks.")
+     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, dataset_sampling_rate)
 
      current_step += 1
      progress((current_step, total_step), desc="Create dataset.")
 
      transcripts = []
      audios = []
      with tempfile.TemporaryDirectory() as tmpdirname:
+         for i, chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for).")):
+
+             # TODO: make sure 1D or 2D?
              arr = chunk["audio"]
              path = os.path.join(tmpdirname, f"{i}.wav")
+             wavfile.write(path, dataset_sampling_rate, arr)
 
              if use_demucs == "separate-audio":
+                 # use demucs to separate vocals
+                 print(f"Separating vocals #{i}")
                  path = separate_vocal(path)
 
              audios.append(path)
 
      progress((current_step, total_step), desc="Push dataset.")
      dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
 
+
+     return html_embed_str, [[transcript] for transcript in transcripts], text
+
+
+ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars=".!:;?", min_duration=5):
+     # merge consecutive chunks until the merged text ends with a stop character
+     # and the merged audio lasts at least min_duration
+     # returns a list of dictionaries (text, audio)
+     # min_duration is given in seconds, then converted to samples
+     min_duration = int(min_duration * sampling_rate)
+
+
+     new_chunks = []
+     while chunks:
+         current_chunk = chunks.pop(0)
+
+         begin, end = current_chunk["timestamp"]
+         begin, end = int(begin * sampling_rate), int(end * sampling_rate)
+
+         current_dur = end - begin
+
+         text = current_chunk["text"]
+
+
+         chunk_to_concat = [audio_array[begin:end]]
+         while chunks and (text[-1] not in stop_chars or (current_dur < min_duration)):
+             ch = chunks.pop(0)
+             begin, end = ch["timestamp"]
+             begin, end = int(begin * sampling_rate), int(end * sampling_rate)
+             current_dur += end - begin
+
+             text = "".join([text, ch["text"]])
+
+             # TODO: add silence ?
+             chunk_to_concat.append(audio_array[begin:end])
+
+
+         new_chunks.append({
+             "text": text.strip(),
+             "audio": np.concatenate(chunk_to_concat),
+         })
+         print(f"LENGTH CHUNK #{len(new_chunks)}: {current_dur/sampling_rate}s")
+
+     return new_chunks
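+
+ # Example (hypothetical values): with the defaults stop_chars=".!:;?" and
+ # min_duration=5, the chunks {"timestamp": (0.0, 3.0), "text": " Hello"} and
+ # {"timestamp": (3.0, 6.5), "text": " world."} merge into a single entry
+ # {"text": "Hello world.", "audio": <6.5 s of samples>}: " Hello" does not end
+ # with a stop character, so merging continues; after the merge the text ends
+ # with "." and 6.5 s >= 5 s, so the merged chunk is emitted.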
249
+
250
+ css = """
251
+ #intro{
252
+ max-width: 100%;
253
+ text-align: center;
254
+ margin: 0 auto;
255
+ }
256
+ """
  with gr.Blocks(css=css) as demo:
      with gr.Row():
          gr.LoginButton()
          gr.LogoutButton()
 
      with gr.Tab("YouTube"):
+         gr.Markdown("Create your own TTS dataset using YouTube", elem_id="intro")
+         gr.Markdown(
+             "This demo allows you to create a text-to-speech dataset from an input audio snippet and push it to the hub to keep track of it."
+             f" Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+             " of arbitrary length. It then merges chunks of audio and pushes them to the hub."
+         )
          with gr.Row():
              with gr.Column():
                  audio_youtube = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
                  task_youtube = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+                 cleaning_youtube = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not clean (background noise and music))", value="separate-audio")
+                 textbox_youtube = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
 
          with gr.Row():
              clear_youtube = gr.ClearButton([audio_youtube, task_youtube, cleaning_youtube, textbox_youtube])
 
          with gr.Column():
              html_youtube = gr.HTML()
+             dataset_youtube = gr.Dataset(label="Transcribed samples.", components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
              transcript_youtube = gr.Textbox(label="Transcription")
 
      with gr.Tab("Microphone or Audio file"):
          gr.Markdown("Create your own TTS dataset using your own recordings", elem_id="intro")
+         gr.Markdown(
+             "This demo allows you to create a text-to-speech dataset from an input audio snippet and push it to the hub to keep track of it."
+             f" Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+             " of arbitrary length. It then merges chunks of audio and pushes them to the hub."
+         )
          with gr.Row():
              with gr.Column():
                  audio_file = gr.Audio(type="filepath")
                  task_file = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+                 cleaning_file = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not clean (background noise and music))", value="separate-audio")
+                 textbox_file = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
 
          with gr.Row():
              clear_file = gr.ClearButton([audio_file, task_file, cleaning_file, textbox_file])
 
              dataset_file = gr.Dataset(label="Transcribed samples.", components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
              transcript_file = gr.Textbox(label="Transcription")
 
+
+
      submit_file.click(transcribe, inputs=[audio_file, task_file, cleaning_file, textbox_file], outputs=[dataset_file, transcript_file])
+     submit_youtube.click(yt_transcribe, inputs=[audio_youtube, task_youtube, cleaning_youtube, textbox_youtube], outputs=[html_youtube, dataset_youtube, transcript_youtube])
 
+ demo.launch(debug=True)
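+
+ # Usage sketch (assumes a dataset was pushed above; replace the placeholder
+ # with the <user>/<dataset_name> entered in the textbox):
+ #
+ #     from datasets import load_dataset
+ #     ds = load_dataset("<user>/<dataset_name>", split="train")
+ #     print(ds[0]["text"], ds[0]["audio"]["sampling_rate"])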