Maximofn commited on
Commit
9b5b5b7
1 Parent(s): d73543f

Implement speaker diarization and transcription merging pipeline

Browse files

- Add diarization functionality with configurable speaker count
- Create merge_transcription_and_diarization function to assign speakers to transcribed text
- Enhance app.py with debug mode and dynamic file loading
- Update UI to support advanced speaker configuration
- Modify logo display and add visibility controls for intermediate outputs

Files changed (1) hide show
  1. app.py +197 -17
app.py CHANGED
@@ -19,12 +19,14 @@ from ui_config import (
19
  ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
20
  BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
21
  BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
22
- html_social_media, get_html_subtify_logo, html_buy_me_a_coffe
23
  )
24
  # from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
25
  from slice_audio import slice_audio as slice_audio_main
26
  from audio import get_audio_from_video
27
  from transcribe import transcribe, get_language_dict
 
 
28
 
29
  NUMBER = 100
30
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -56,13 +58,17 @@ YOUTUBE = "youtube"
56
  TWITCH = "twitch"
57
  ERROR = "error"
58
 
 
 
 
59
  subtify_logo = Image.open("assets/subtify_logo-scaled.png")
60
  subtify_logo_width, subtify_logo_height = subtify_logo.size
61
  factor = 4
62
  new_width = subtify_logo_width // factor
63
  new_height = subtify_logo_height // factor
64
 
65
- html_subtify_logo = get_html_subtify_logo(new_width, new_height)
 
66
 
67
  language_dict = union_language_dict()
68
 
@@ -118,8 +124,10 @@ def change_visibility_texboxes():
118
  return (
119
  gr.update(value="Done"), # auxiliar_block1
120
  gr.update(visible=True), # get_audio_from_video_info
 
121
  gr.update(visible=True), # video_sliced_progress_info
122
  gr.update(visible=True), # video_transcribed_progress_info
 
123
  gr.update(visible=True), # transcriptions_concatenated_progress_info
124
  gr.update(visible=True), # video_translated_progress_info
125
  gr.update(visible=True), # video_subtitled_progress_info
@@ -128,8 +136,17 @@ def change_visibility_texboxes():
128
  def get_audio(video_path):
129
  print('*'*NUMBER)
130
  print(f"Getting audio from video {video_path}")
131
-
132
  audios_folder = "audios"
 
 
 
 
 
 
 
 
 
133
  try:
134
  audio_path = get_audio_from_video(video_path, audios_folder)
135
  return [
@@ -162,18 +179,64 @@ def slice_audio(input_audio_path):
162
  gr.update(value="Ok"), # video_sliced_progress_info
163
  )
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  def trascribe_audio(input_audio_path, source_languaje):
166
  print('*'*NUMBER)
167
  print(f"Transcript {input_audio_path}")
168
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  # Get language dict
170
  language_dict = get_language_dict()
171
 
172
  # Transcribe audio file
173
- transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
 
 
 
 
 
174
 
175
  return (
176
- gr.Textbox(value="Ok")
 
177
  )
178
 
179
  def concatenate_transcriptions():
@@ -276,6 +339,16 @@ def process_uploaded_video(video_path):
276
  videos_folder = "videos"
277
  if not os.path.exists(videos_folder):
278
  os.makedirs(videos_folder)
 
 
 
 
 
 
 
 
 
 
279
 
280
  # Copy uploaded video to videos folder
281
  new_video_path = os.path.join(videos_folder, "download_video.mp4")
@@ -285,7 +358,103 @@ def process_uploaded_video(video_path):
285
  return [
286
  gr.update(label="Video uploaded"), # video_input
287
  gr.update(visible=True), # config_block
288
- gr.update(value=new_video_path) # original_video_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  ]
290
 
291
  @spaces.GPU
@@ -336,7 +505,7 @@ def subtify():
336
  # Layout
337
  gr.HTML(html_social_media)
338
  gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
339
- gr.HTML(html_subtify_logo)
340
 
341
  # Input block, where the user can upload a video and configure the subtify process
342
  visible = False
@@ -357,21 +526,27 @@ def subtify():
357
  with gr.Row():
358
  source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
359
  target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
360
- with gr.Accordion("Advanced settings", open=False, visible=True) as Advanced_setings:
361
- number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
 
 
362
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
363
 
364
  auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
365
  with gr.Row():
366
  get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
367
  video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
 
 
368
  transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
369
  video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
370
  video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
371
 
372
  original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
373
  original_video_path = gr.Textbox(label="Original video path", visible=visible)
374
- original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=visible)
 
 
375
  original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
376
  subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
377
  auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
@@ -400,11 +575,11 @@ def subtify():
400
  video_input.change(
401
  fn=process_uploaded_video,
402
  inputs=[video_input],
403
- outputs=[video_input, config_block, original_video_path]
404
  )
405
  subtify_button.click(
406
  fn=change_visibility_texboxes,
407
- outputs=[auxiliar_block1, get_audio_from_video_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
408
  )
409
  auxiliar_block1.change(
410
  fn=get_audio,
@@ -414,12 +589,17 @@ def subtify():
414
  get_audio_from_video_info.change(
415
  fn=trascribe_audio,
416
  inputs=[original_audio_path, source_languaje],
417
- outputs=[video_transcribed_progress_info]
 
 
 
 
 
 
 
 
 
418
  )
419
- # video_transcribed_progress_info.change(
420
- # fn=concatenate_transcriptions,
421
- # outputs=[transcriptions_concatenated_progress_info, original_audio_transcribed_path]
422
- # )
423
  # transcriptions_concatenated_progress_info.change(
424
  # fn=translate_transcription,
425
  # inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
 
19
  ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
20
  BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
21
  BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
22
+ html_social_media, get_html_subtify_logo_big, get_html_subtify_logo_small, html_buy_me_a_coffe
23
  )
24
  # from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
25
  from slice_audio import slice_audio as slice_audio_main
26
  from audio import get_audio_from_video
27
  from transcribe import transcribe, get_language_dict
28
+ from diarize_library import diarize_audio
29
+ import json
30
 
31
  NUMBER = 100
32
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
58
  TWITCH = "twitch"
59
  ERROR = "error"
60
 
61
+ VIEW_OUTPUTS = True
62
+ DEBUG = True
63
+
64
  subtify_logo = Image.open("assets/subtify_logo-scaled.png")
65
  subtify_logo_width, subtify_logo_height = subtify_logo.size
66
  factor = 4
67
  new_width = subtify_logo_width // factor
68
  new_height = subtify_logo_height // factor
69
 
70
+ html_subtify_logo_big = get_html_subtify_logo_big(new_width, new_height)
71
+ html_subtify_logo_small = get_html_subtify_logo_small(new_width, new_height)
72
 
73
  language_dict = union_language_dict()
74
 
 
124
  return (
125
  gr.update(value="Done"), # auxiliar_block1
126
  gr.update(visible=True), # get_audio_from_video_info
127
+ gr.update(visible=True), # merged_transcription
128
  gr.update(visible=True), # video_sliced_progress_info
129
  gr.update(visible=True), # video_transcribed_progress_info
130
+ gr.update(visible=True), # diarization_progress_info
131
  gr.update(visible=True), # transcriptions_concatenated_progress_info
132
  gr.update(visible=True), # video_translated_progress_info
133
  gr.update(visible=True), # video_subtitled_progress_info
 
136
  def get_audio(video_path):
137
  print('*'*NUMBER)
138
  print(f"Getting audio from video {video_path}")
139
+
140
  audios_folder = "audios"
141
+
142
+ if DEBUG:
143
+ audio_file = f"{audios_folder}/download_audio.mp3"
144
+ if os.path.exists(audio_file):
145
+ return [
146
+ gr.update(value="Loaded"), # get_audio_from_video_info
147
+ gr.update(value=audio_file) # original_audio_path
148
+ ]
149
+
150
  try:
151
  audio_path = get_audio_from_video(video_path, audios_folder)
152
  return [
 
179
  gr.update(value="Ok"), # video_sliced_progress_info
180
  )
181
 
182
+ def diarize(input_audio_path, num_speakers, min_speakers, max_speakers):
183
+ print('*'*NUMBER)
184
+ print(f"Diarize {input_audio_path}")
185
+
186
+ # Diarization file
187
+ diarization_file = f"diarization/diarization.json"
188
+
189
+ if DEBUG:
190
+ if os.path.exists(diarization_file):
191
+ with open(diarization_file, "r") as f:
192
+ diarization = f.read()
193
+ return [
194
+ gr.update(value="Loaded"),
195
+ gr.update(value=diarization)
196
+ ]
197
+
198
+ # Diarize audio
199
+ diarization = diarize_audio(input_audio_path, num_speakers, min_speakers, max_speakers, DEVICE)
200
+
201
+ # Save diarization
202
+ with open(diarization_file, "w") as f:
203
+ json.dump(diarization, f)
204
+
205
+ return [
206
+ gr.update(value="Ok"),
207
+ gr.update(value=diarization)
208
+ ]
209
+
210
  def trascribe_audio(input_audio_path, source_languaje):
211
  print('*'*NUMBER)
212
  print(f"Transcript {input_audio_path}")
213
 
214
+ # Transcription file
215
+ transcription_file = f"transcriptions/transcription_{source_languaje}.json"
216
+
217
+ if DEBUG:
218
+ if os.path.exists(transcription_file):
219
+ transcription = open(transcription_file, "r").read()
220
+ transcription = json.loads(transcription)
221
+ return [
222
+ gr.update(value="Loaded"),
223
+ gr.update(value=transcription)
224
+ ]
225
+
226
  # Get language dict
227
  language_dict = get_language_dict()
228
 
229
  # Transcribe audio file
230
+ transcription_str, transcription_dict = transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
231
+
232
+ # Save transcription
233
+ with open(transcription_file, "w") as f:
234
+ transcription_json = json.dumps(transcription_dict)
235
+ f.write(transcription_json)
236
 
237
  return (
238
+ gr.update(value="Ok"),
239
+ gr.update(value=transcription_dict)
240
  )
241
 
242
  def concatenate_transcriptions():
 
339
  videos_folder = "videos"
340
  if not os.path.exists(videos_folder):
341
  os.makedirs(videos_folder)
342
+
343
+ if DEBUG:
344
+ video_file = f"{videos_folder}/download_video.mp4"
345
+ if os.path.exists(video_file):
346
+ return [
347
+ gr.update(label="Video uploaded"), # video_input
348
+ gr.update(visible=True), # config_block
349
+ gr.update(value=video_file), # original_video_path
350
+ gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
351
+ ]
352
 
353
  # Copy uploaded video to videos folder
354
  new_video_path = os.path.join(videos_folder, "download_video.mp4")
 
358
  return [
359
  gr.update(label="Video uploaded"), # video_input
360
  gr.update(visible=True), # config_block
361
+ gr.update(value=new_video_path), # original_video_path
362
+ gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
363
+ ]
364
+
365
+ def merge_transcription_and_diarization():
366
+ """
367
+ Combina la transcripción y la diarización para asignar speakers a cada palabra.
368
+
369
+ Returns:
370
+ dict: Transcripción combinada con información de speakers
371
+ """
372
+ print('*'*NUMBER)
373
+ print("Merge transcription and diarization")
374
+
375
+ if DEBUG:
376
+ merged_transcription_path = "merged_transcription_diarization/merged.json"
377
+ if os.path.exists(merged_transcription_path):
378
+ with open(merged_transcription_path, 'r') as f:
379
+ merged_transcription = json.load(f)
380
+ return [
381
+ gr.update(value="Loaded"),
382
+ gr.update(value=merged_transcription)
383
+ ]
384
+
385
+ transcription_path = "transcriptions/transcription_English.json"
386
+ diarization_path = "diarization/diarization.json"
387
+ # Cargar los archivos JSON
388
+ with open(transcription_path, 'r') as f:
389
+ transcription = json.load(f)
390
+ with open(diarization_path, 'r') as f:
391
+ diarization = json.load(f)
392
+
393
+ # Crear una nueva lista para los chunks combinados
394
+ merged_chunks = []
395
+
396
+ # Para cada palabra en la transcripción
397
+ for chunk in transcription.get('chunks', []):
398
+ # Verificar que el chunk tiene timestamps válidos
399
+ if not (isinstance(chunk.get('start'), (int, float)) and
400
+ isinstance(chunk.get('end'), (int, float))):
401
+ continue
402
+
403
+ word_start = float(chunk['start'])
404
+ word_end = float(chunk['end'])
405
+
406
+ # Encontrar el speaker correspondiente en la diarización
407
+ speaker = None
408
+ for segment in diarization:
409
+ # Verificar que el segmento tiene timestamps válidos
410
+ if not (isinstance(segment.get('start'), (int, float)) and
411
+ isinstance(segment.get('end'), (int, float))):
412
+ continue
413
+
414
+ segment_start = float(segment['start'])
415
+ segment_end = float(segment['end'])
416
+
417
+ # Si la palabra está dentro del rango de tiempo del segmento
418
+ if (word_start >= segment_start and word_end <= segment_end):
419
+ speaker = segment['speaker']
420
+ break
421
+
422
+ # Si la palabra está mayormente dentro del segmento (más del 50% de su duración)
423
+ word_duration = word_end - word_start
424
+ overlap_start = max(word_start, segment_start)
425
+ overlap_end = min(word_end, segment_end)
426
+ overlap_duration = max(0, overlap_end - overlap_start)
427
+
428
+ if overlap_duration > word_duration * 0.5:
429
+ speaker = segment['speaker']
430
+ break
431
+
432
+ # Crear el nuevo chunk con la información del speaker
433
+ merged_chunk = {
434
+ 'start': word_start,
435
+ 'end': word_end,
436
+ 'text': chunk['text'],
437
+ 'speaker': speaker if speaker else 'UNKNOWN'
438
+ }
439
+ merged_chunks.append(merged_chunk)
440
+
441
+ # Crear el diccionario final
442
+ merged_transcription = {
443
+ 'text': transcription.get('text', ''),
444
+ 'chunks': merged_chunks
445
+ }
446
+
447
+ # Crear el directorio si no existe
448
+ if not os.path.exists(merged_transcription_path):
449
+ os.makedirs(merged_transcription_path)
450
+
451
+ # Guardar el resultado en el nuevo directorio
452
+ with open(merged_transcription_path, 'w', encoding='utf-8') as f:
453
+ json.dump(merged_transcription, f, ensure_ascii=False, indent=2)
454
+
455
+ return [
456
+ gr.update(value="Ok"),
457
+ gr.update(value=merged_transcription)
458
  ]
459
 
460
  @spaces.GPU
 
505
  # Layout
506
  gr.HTML(html_social_media)
507
  gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
508
+ html_subtify_logo_component = gr.HTML(html_subtify_logo_big)
509
 
510
  # Input block, where the user can upload a video and configure the subtify process
511
  visible = False
 
526
  with gr.Row():
527
  source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
528
  target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
529
+ with gr.Accordion("Advanced settings", open=True, visible=True) as Advanced_setings:
530
+ number_of_speakers = gr.Number(visible=True, label="Number of speakers", show_label=True, value=0, interactive=True, info="Number of speakers in the video, if you don't know, select 0")
531
+ min_speakers = gr.Number(visible=True, label="Min speakers", show_label=True, value=0, scale=0, interactive=True, info="Minimum number of speakers in the video")
532
+ max_speakers = gr.Number(visible=True, label="Max speakers", show_label=True, value=0, scale=0, interactive=True, info="Maximum number of speakers in the video")
533
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
534
 
535
  auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
536
  with gr.Row():
537
  get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
538
  video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
539
+ diarization_progress_info = gr.Textbox(placeholder="Waiting", label="Diarize progress info", elem_id="diarization_progress_info", interactive=False, visible=visible)
540
+ merged_transcription_progress_info = gr.Textbox(placeholder="Waiting", label="Merge transcription and diarization progress info", elem_id="merged_transcription_progress_info", interactive=False, visible=visible)
541
  transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
542
  video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
543
  video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
544
 
545
  original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
546
  original_video_path = gr.Textbox(label="Original video path", visible=visible)
547
+ transcription = gr.Textbox(label="transcription", elem_id="transcription", visible=VIEW_OUTPUTS)
548
+ diarization = gr.Textbox(label="diarization", elem_id="diarization", visible=VIEW_OUTPUTS)
549
+ merged_transcription = gr.Textbox(label="merged_transcription", elem_id="merged_transcription", visible=VIEW_OUTPUTS)
550
  original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
551
  subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
552
  auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
 
575
  video_input.change(
576
  fn=process_uploaded_video,
577
  inputs=[video_input],
578
+ outputs=[video_input, config_block, original_video_path, html_subtify_logo_component]
579
  )
580
  subtify_button.click(
581
  fn=change_visibility_texboxes,
582
+ outputs=[auxiliar_block1, get_audio_from_video_info, merged_transcription_progress_info, video_transcribed_progress_info, diarization_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
583
  )
584
  auxiliar_block1.change(
585
  fn=get_audio,
 
589
  get_audio_from_video_info.change(
590
  fn=trascribe_audio,
591
  inputs=[original_audio_path, source_languaje],
592
+ outputs=[video_transcribed_progress_info, transcription]
593
+ )
594
+ video_transcribed_progress_info.change(
595
+ fn=diarize,
596
+ inputs=[original_audio_path, number_of_speakers, min_speakers, max_speakers],
597
+ outputs=[diarization_progress_info, diarization]
598
+ )
599
+ diarization_progress_info.change(
600
+ fn=merge_transcription_and_diarization,
601
+ outputs=[merged_transcription_progress_info, merged_transcription]
602
  )
 
 
 
 
603
  # transcriptions_concatenated_progress_info.change(
604
  # fn=translate_transcription,
605
  # inputs=[original_audio_transcribed_path, source_languaje, target_languaje],