marccgrau committed
Commit
f978308
1 Parent(s): be3d36c

Upload app.py

Files changed (1)
  app.py +569 -0
app.py ADDED
@@ -0,0 +1,569 @@
+ # Inspiration from https://huggingface.co/spaces/vumichien/whisper-speaker-diarization
+
+ import whisper
+ import datetime
+ import subprocess
+ import gradio as gr
+ from pathlib import Path
+ import pandas as pd
+ import re
+ import time
+ import os
+ import numpy as np
+ from sklearn.cluster import AgglomerativeClustering
+
+ from pytube import YouTube
+ import torch
+ import pyannote.audio
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+ from pyannote.audio import Audio
+ from pyannote.core import Segment
+
+ from gpuinfo import GPUInfo
+
+ import wave
+ import contextlib
+ from transformers import pipeline
+ import psutil
+
+ from zipfile import ZipFile
+ from io import StringIO
+ import csv
+
+ # ---- Model Loading ----
+
+ whisper_models = ["base", "small", "medium", "large"]
+ source_languages = {
+     "en": "English",
+     "de": "German",
+     "es": "Spanish",
+     "fr": "French",
+ }
+
+ source_language_list = list(source_languages.keys())
+
+ MODEL_NAME = "openai/whisper-small"
+ lang = "en"
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ pipe = pipeline(
+     task="automatic-speech-recognition",
+     model=MODEL_NAME,
+     chunk_length_s=30,
+     device=device,
+ )
+
+ pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
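+ # Note: the forced decoder prompt pins this transformers pipeline to English
+ # transcription. The segment-level functions below load a separate openai-whisper
+ # model and use the language selected in the UI instead.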
+
+ embedding_model = PretrainedSpeakerEmbedding(
+     "speechbrain/spkrec-ecapa-voxceleb",
+     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
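+ # The ECAPA-TDNN model from SpeechBrain returns one 192-dimensional speaker
+ # embedding per audio clip; these vectors are what gets clustered into speakers below.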
+
+ # ---- S2T & Speaker diarization ----
+
+ def transcribe(microphone, file_upload):
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     file = microphone if microphone is not None else file_upload
+
+     text = pipe(file)["text"]
+
+     return warn_output + text
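+ # NOTE: transcribe() above only runs the transformers ASR pipeline on a single file;
+ # it performs no diarization and is not wired into the Gradio layout below.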
+
+
+ def convert_time(secs):
+     return datetime.timedelta(seconds=round(secs))
+
+ def convert_to_wav(filepath):
+     _, file_ending = os.path.splitext(f'{filepath}')
+     audio_file = filepath.replace(file_ending, ".wav")
+     print("starting conversion to wav")
+     os.system(f'ffmpeg -i "{filepath}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
+     return audio_file
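+ # NOTE: ffmpeg resamples the input to 16 kHz mono 16-bit PCM so the wave module can
+ # read it for the duration check below. If the input is already a .wav file, the
+ # output path equals the input path and ffmpeg (called without -y) will typically
+ # refuse to overwrite it, so the original file is used as-is.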
+
+
+ def speech_to_text(microphone, file_upload, selected_source_lang, whisper_model, num_speakers):
+     """
+     # Transcribe an audio file, split it into segments, and assign a speaker to each segment
+     1. Using OpenAI's Whisper model to split the audio into segments and generate transcripts.
+     2. Generating a speaker embedding for each segment.
+     3. Applying agglomerative clustering on the embeddings to identify the speaker of each segment.
+
+     Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
+     """
+
+     model = whisper.load_model(whisper_model)
+     time_start = time.time()
+
+     try:
+         # Read and convert audio file
+         warn_output = ""
+         if (microphone is not None) and (file_upload is not None):
+             warn_output = (
+                 "WARNING: You've uploaded an audio file and used the microphone. "
+                 "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+             )
+
+         elif (microphone is None) and (file_upload is None):
+             return "ERROR: You have to either use the microphone or upload an audio file"
+
+         file = microphone if microphone is not None else file_upload
+
+         if microphone is None and file_upload is not None:
+             file = convert_to_wav(file)
+
+         # Get duration
+         with contextlib.closing(wave.open(file, 'r')) as f:
+             frames = f.getnframes()
+             rate = f.getframerate()
+             duration = frames / float(rate)
+         print(f"conversion to wav ready, duration of audio file: {duration}")
+
+         # Transcribe audio
+         options = dict(language=selected_source_lang, beam_size=3, best_of=3)
+         transcribe_options = dict(task="transcribe", **options)
+         result = model.transcribe(file, **transcribe_options)
+         segments = result["segments"]
+         print("whisper done with transcription")
+     except Exception as e:
+         raise RuntimeError("Error converting audio file") from e
+
+     try:
+         # Create embedding
+         def segment_embedding(segment):
+             audio = Audio()
+             start = segment["start"]
+             # Whisper overshoots the end timestamp in the last segment
+             end = min(duration, segment["end"])
+             clip = Segment(start, end)
+             waveform, sample_rate = audio.crop(file, clip)
+             return embedding_model(waveform[None])
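+         # audio.crop() returns a (channel, time) tensor for the segment's time window;
+         # waveform[None] adds the batch dimension the embedding model expects.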
+
+         embeddings = np.zeros(shape=(len(segments), 192))
+         for i, segment in enumerate(segments):
+             embeddings[i] = segment_embedding(segment)
+         embeddings = np.nan_to_num(embeddings)
+         print(f'Embedding shape: {embeddings.shape}')
+
+         # Assign speaker label
+         if num_speakers == 1:
+             for i in range(len(segments)):
+                 segments[i]["speaker"] = 'SPEAKER 1'
+         else:
+             clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+             labels = clustering.labels_
+             for i in range(len(segments)):
+                 segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
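+         # Agglomerative clustering groups the 192-dim segment embeddings into
+         # num_speakers clusters; each cluster label is mapped to a "SPEAKER n" tag.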
+
+         # Make output
+         objects = {
+             'Start': [],
+             'End': [],
+             'Speaker': [],
+             'Text': []
+         }
+         text = ''
+         if num_speakers == 1:
+             objects['Start'].append(str(convert_time(segments[0]["start"])))
+             objects['Speaker'].append(segments[0]["speaker"])
+             for segment in segments:
+                 text += segment["text"] + ' '
+             objects['Text'].append(text)
+             objects['End'].append(str(convert_time(segments[-1]["end"])))
+         else:
+             for (i, segment) in enumerate(segments):
+                 if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+                     objects['Start'].append(str(convert_time(segment["start"])))
+                     objects['Speaker'].append(segment["speaker"])
+                     if i != 0:
+                         objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+                         objects['Text'].append(text)
+                         text = ''
+                 text += segment["text"] + ' '
+             objects['End'].append(str(convert_time(segments[-1]["end"])))
+             objects['Text'].append(text)
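+         # Consecutive segments from the same speaker are merged into a single row,
+         # so the table shows one Start/End/Text entry per speaker turn.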
+
+         time_end = time.time()
+         time_diff = time_end - time_start
+         memory = psutil.virtual_memory()
+         gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+         gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+         gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+         system_info = f"""
+         *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+         *Processing time: {time_diff:.5} seconds.*
+         *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+         """
+
+         return pd.DataFrame(objects), system_info
+
+     except Exception as e:
+         raise RuntimeError("Error Running inference with local model", e)
+
+ # ---- Youtube Conversion ----
+
+ def get_youtube(video_url):
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+     print("Successfully downloaded video")
+     print(abs_video_path)
+     return abs_video_path
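+ # pytube downloads the highest-resolution progressive mp4 stream (audio and video in
+ # one file) and returns its local path, which the UI then passes on to yt_to_text.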
+
+
+
+ def yt_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
+     """
+     # Transcribe a Youtube link using OpenAI Whisper
+     1. Using OpenAI's Whisper model to split the audio into segments and generate transcripts.
+     2. Generating a speaker embedding for each segment.
+     3. Applying agglomerative clustering on the embeddings to identify the speaker of each segment.
+
+     Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
+     """
+
+     model = whisper.load_model(whisper_model)
+     time_start = time.time()
+     if video_file_path is None:
+         raise ValueError("Error: no video input")
+     print(video_file_path)
+
+     try:
+         # Read and convert youtube video
+         _, file_ending = os.path.splitext(f'{video_file_path}')
+         print(f'file ending is {file_ending}')
+         audio_file = video_file_path.replace(file_ending, ".wav")
+         print("starting conversion to wav")
+         os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
+
+         # Get duration
+         with contextlib.closing(wave.open(audio_file, 'r')) as f:
+             frames = f.getnframes()
+             rate = f.getframerate()
+             duration = frames / float(rate)
+         print(f"conversion to wav ready, duration of audio file: {duration}")
+
+         # Transcribe audio
+         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
+         transcribe_options = dict(task="transcribe", **options)
+         result = model.transcribe(audio_file, **transcribe_options)
+         segments = result["segments"]
+         print("whisper done with transcription")
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio") from e
+
+     try:
+         # Create embedding
+         def segment_embedding(segment):
+             audio = Audio()
+             start = segment["start"]
+             # Whisper overshoots the end timestamp in the last segment
+             end = min(duration, segment["end"])
+             clip = Segment(start, end)
+             waveform, sample_rate = audio.crop(audio_file, clip)
+             return embedding_model(waveform[None])
+
+         embeddings = np.zeros(shape=(len(segments), 192))
+         for i, segment in enumerate(segments):
+             embeddings[i] = segment_embedding(segment)
+         embeddings = np.nan_to_num(embeddings)
+         print(f'Embedding shape: {embeddings.shape}')
+
+         # Assign speaker label
+         if num_speakers == 1:
+             for i in range(len(segments)):
+                 segments[i]["speaker"] = 'SPEAKER 1'
+         else:
+             clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+             labels = clustering.labels_
+             for i in range(len(segments)):
+                 segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+         # Make output
+         objects = {
+             'Start': [],
+             'End': [],
+             'Speaker': [],
+             'Text': []
+         }
+         text = ''
+         if num_speakers == 1:
+             objects['Start'].append(str(convert_time(segments[0]["start"])))
+             objects['Speaker'].append(segments[0]["speaker"])
+             for segment in segments:
+                 text += segment["text"] + ' '
+             objects['Text'].append(text)
+             objects['End'].append(str(convert_time(segments[-1]["end"])))
+         else:
+             for (i, segment) in enumerate(segments):
+                 if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+                     objects['Start'].append(str(convert_time(segment["start"])))
+                     objects['Speaker'].append(segment["speaker"])
+                     if i != 0:
+                         objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+                         objects['Text'].append(text)
+                         text = ''
+                 text += segment["text"] + ' '
+             objects['End'].append(str(convert_time(segments[-1]["end"])))
+             objects['Text'].append(text)
+
+         time_end = time.time()
+         time_diff = time_end - time_start
+         memory = psutil.virtual_memory()
+         gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+         gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+         gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+         system_info = f"""
+         *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+         *Processing time: {time_diff:.5} seconds.*
+         *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+         """
+
+         return pd.DataFrame(objects), system_info
+
+     except Exception as e:
+         raise RuntimeError("Error Running inference with local model", e)
+
+ def download_csv(dataframe: pd.DataFrame):
+     compression_options = dict(method='zip', archive_name='output.csv')
+     dataframe.to_csv('output.zip', index=False, compression=compression_options)
+     return 'output.zip'
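+ # pandas writes the transcription table as output.csv inside output.zip; the returned
+ # path is served by the gr.outputs.File component for download.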
+
+ # ---- Gradio Layout ----
+ # Inspiration from https://huggingface.co/spaces/vumichien/whisper-speaker-diarization
+
+ # -- General Functions --
+ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
+ memory = psutil.virtual_memory()
+ title = "Whisper speaker diarization & speech recognition"
+ interface = gr.Blocks(title=title)
+ interface.encrypt = False
+
+ # -- Functions Audio Input --
+ microphone_in = gr.inputs.Audio(source="microphone",
+                                 type="filepath",
+                                 optional=True)
+
+ upload_in = gr.inputs.Audio(source="upload",
+                             type="filepath",
+                             optional=True)
+
+ selected_source_lang_audio = gr.Dropdown(choices=source_language_list,
+                                          type="value",
+                                          value="en",
+                                          label="Spoken language in audio",
+                                          interactive=True)
+
+ selected_whisper_model_audio = gr.Dropdown(choices=whisper_models,
+                                            type="value",
+                                            value="base",
+                                            label="Selected Whisper model",
+                                            interactive=True)
+
+ number_speakers_audio = gr.Number(precision=0,
+                                   value=2,
+                                   label="Selected number of speakers",
+                                   interactive=True)
+
+ system_info_audio = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
+
+ transcription_df_audio = gr.DataFrame(value=df_init,
+                                       label="Transcription dataframe",
+                                       row_count=(0, "dynamic"),
+                                       max_rows=10,
+                                       wrap=True,
+                                       overflow_row_behaviour='paginate')
+
+ csv_download_audio = gr.outputs.File(label="Download CSV")
+
+ # -- Functions Video Input --
+ video_in = gr.Video(label="Video file",
+                     mirror_webcam=False)
+
+ youtube_url_in = gr.Textbox(label="Youtube url",
+                             lines=1,
+                             interactive=True)
+
+ selected_source_lang_yt = gr.Dropdown(choices=source_language_list,
+                                       type="value",
+                                       value="en",
+                                       label="Spoken language in audio",
+                                       interactive=True)
+
+ selected_whisper_model_yt = gr.Dropdown(choices=whisper_models,
+                                         type="value",
+                                         value="base",
+                                         label="Selected Whisper model",
+                                         interactive=True)
+
+ number_speakers_yt = gr.Number(precision=0,
+                                value=2,
+                                label="Selected number of speakers",
+                                interactive=True)
+
+ system_info_yt = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
+
+ transcription_df_yt = gr.DataFrame(value=df_init,
+                                    label="Transcription dataframe",
+                                    row_count=(0, "dynamic"),
+                                    max_rows=10,
+                                    wrap=True,
+                                    overflow_row_behaviour='paginate')
+
+ csv_download_yt = gr.outputs.File(label="Download CSV")
+
+ with interface:
+     with gr.Tab("Whisper speaker diarization & speech recognition"):
+         gr.Markdown('''
+             <div>
+             <h1 style='text-align: center'>Whisper speaker diarization & speech recognition</h1>
+             This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recognize speech and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers.
+             </div>
+             ''')
+
+         with gr.Row():
+             gr.Markdown('''
+                 ### Transcribe audio (microphone or file upload) using OpenAI Whisper
+                 ##### 1. Using OpenAI's Whisper model to split the audio into segments and generate transcripts.
+                 ##### 2. Generating a speaker embedding for each segment.
+                 ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker of each segment.
+                 ''')
+
+         with gr.Row():
+             with gr.Column():
+                 microphone_in.render()
+                 upload_in.render()
+             with gr.Column():
+                 gr.Markdown('''
+                     ##### Here you can start the transcription process.
+                     ##### Please select the source language for transcription.
+                     ##### Selecting the correct number of speakers improves the results.
+                     ''')
+                 selected_source_lang_audio.render()
+                 selected_whisper_model_audio.render()
+                 number_speakers_audio.render()
+                 transcribe_btn = gr.Button("Transcribe audio and initiate diarization")
+                 transcribe_btn.click(speech_to_text,
+                                      [
+                                          microphone_in,
+                                          upload_in,
+                                          selected_source_lang_audio,
+                                          selected_whisper_model_audio,
+                                          number_speakers_audio
+                                      ],
+                                      [
+                                          transcription_df_audio,
+                                          system_info_audio
+                                      ])
+
+
+         with gr.Row():
+             gr.Markdown('''
+                 ##### The transcription output will appear here:
+                 ''')
+
+
+         with gr.Row():
+             with gr.Column():
+                 transcription_df_audio.render()
+                 system_info_audio.render()
+
+         with gr.Row():
+             with gr.Column():
+                 download_btn = gr.Button("Download transcription dataframe")
+                 download_btn.click(download_csv, transcription_df_audio, csv_download_audio)
+                 csv_download_audio.render()
+
+         with gr.Row():
+             gr.Markdown('''Chair of Data Science and Natural Language Processing - University of St. Gallen''')
+
+     with gr.Tab("Youtube Speech to Text"):
+         with gr.Row():
+             gr.Markdown('''
+                 <div>
+                 <h1 style='text-align: center'>Youtube Speech Recognition & Speaker Diarization</h1>
+                 </div>
+                 ''')
+
+         with gr.Row():
+             gr.Markdown('''
+                 ### Transcribe Youtube link
+                 #### Test with the following examples:
+                 ''')
+             examples = gr.Examples(examples=
+                 [
+                     "https://www.youtube.com/watch?v=vnc-Q8V4ihQ",
+                     "https://www.youtube.com/watch?v=_B60aTHCE5E",
+                     "https://www.youtube.com/watch?v=4BdKZxD-ziA",
+                     "https://www.youtube.com/watch?v=4ezBjAW26Js",
+                 ],
+                 label="Examples UNISG",
+                 inputs=[youtube_url_in])
+
+         with gr.Row():
+             with gr.Column():
+                 youtube_url_in.render()
+                 download_youtube_btn = gr.Button("Download Youtube video")
+                 download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+                 print(video_in)
+
+         with gr.Row():
+             with gr.Column():
+                 video_in.render()
+             with gr.Column():
+                 gr.Markdown('''
+                     #### Start the transcription process.
+                     #### To initiate, please select the source language for transcription.
+                     #### For better results, select the correct number of speakers.
+                     ''')
+                 selected_source_lang_yt.render()
+                 selected_whisper_model_yt.render()
+                 number_speakers_yt.render()
+                 transcribe_btn = gr.Button("Transcribe audio and initiate diarization")
+                 transcribe_btn.click(yt_to_text,
+                                      [
+                                          video_in,
+                                          selected_source_lang_yt,
+                                          selected_whisper_model_yt,
+                                          number_speakers_yt
+                                      ],
+                                      [
+                                          transcription_df_yt,
+                                          system_info_yt
+                                      ])
+
+         with gr.Row():
+             gr.Markdown('''
+                 #### The transcription output will appear here:
+                 ''')
+
+         with gr.Row():
+             with gr.Column():
+                 transcription_df_yt.render()
+                 system_info_yt.render()
+
+         with gr.Row():
+             with gr.Column():
+                 download_btn = gr.Button("Download transcription dataframe")
+                 download_btn.click(download_csv, transcription_df_yt, csv_download_yt)
+                 csv_download_yt.render()
+
+         with gr.Row():
+             gr.Markdown('''Chair of Data Science and Natural Language Processing - University of St. Gallen''')
+
+
+ def main():
+     interface.launch()
+
+
+ if __name__ == "__main__":
+     main()