Implement speaker diarization and transcription merging pipeline
- Add diarization functionality with configurable speaker count
- Create merge_transcription_and_diarization function to assign speakers to transcribed text
- Enhance app.py with debug mode and dynamic file loading
- Update UI to support advanced speaker configuration
- Modify logo display and add visibility controls for intermediate outputs
app.py
CHANGED
@@ -19,12 +19,14 @@ from ui_config import (
     ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
     BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
     BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
-    html_social_media,
+    html_social_media, get_html_subtify_logo_big, get_html_subtify_logo_small, html_buy_me_a_coffe
 )
 # from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
 from slice_audio import slice_audio as slice_audio_main
 from audio import get_audio_from_video
 from transcribe import transcribe, get_language_dict
+from diarize_library import diarize_audio
+import json

 NUMBER = 100
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -56,13 +58,17 @@ YOUTUBE = "youtube"
 TWITCH = "twitch"
 ERROR = "error"

+VIEW_OUTPUTS = True
+DEBUG = True
+
 subtify_logo = Image.open("assets/subtify_logo-scaled.png")
 subtify_logo_width, subtify_logo_height = subtify_logo.size
 factor = 4
 new_width = subtify_logo_width // factor
 new_height = subtify_logo_height // factor

-
+html_subtify_logo_big = get_html_subtify_logo_big(new_width, new_height)
+html_subtify_logo_small = get_html_subtify_logo_small(new_width, new_height)

 language_dict = union_language_dict()

@@ -118,8 +124,10 @@ def change_visibility_texboxes():
     return (
         gr.update(value="Done"), # auxiliar_block1
         gr.update(visible=True), # get_audio_from_video_info
+        gr.update(visible=True), # merged_transcription
         gr.update(visible=True), # video_sliced_progress_info
         gr.update(visible=True), # video_transcribed_progress_info
+        gr.update(visible=True), # diarization_progress_info
         gr.update(visible=True), # transcriptions_concatenated_progress_info
         gr.update(visible=True), # video_translated_progress_info
         gr.update(visible=True), # video_subtitled_progress_info
@@ -128,8 +136,17 @@ def change_visibility_texboxes():
 def get_audio(video_path):
     print('*'*NUMBER)
     print(f"Getting audio from video {video_path}")
-
+
     audios_folder = "audios"
+
+    if DEBUG:
+        audio_file = f"{audios_folder}/download_audio.mp3"
+        if os.path.exists(audio_file):
+            return [
+                gr.update(value="Loaded"), # get_audio_from_video_info
+                gr.update(value=audio_file) # original_audio_path
+            ]
+
     try:
         audio_path = get_audio_from_video(video_path, audios_folder)
         return [
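Note the DEBUG branch: if a previously generated artifact already exists on disk, the function returns it instead of recomputing. The same shortcut recurs in trascribe_audio, diarize, and process_uploaded_video below. A generic sketch of the pattern as a standalone helper (hypothetical, not part of this commit):

import json
import os

DEBUG = True

def cached_or_compute(cache_path, compute):
    # In DEBUG mode, reuse the artifact from a previous run if it exists
    if DEBUG and os.path.exists(cache_path):
        with open(cache_path, "r") as f:
            return json.load(f), "Loaded"
    # Otherwise compute it and cache it for the next run
    result = compute()
    os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
    with open(cache_path, "w") as f:
        json.dump(result, f)
    return result, "Ok"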
@@ -162,18 +179,64 @@ def slice_audio(input_audio_path):
         gr.update(value="Ok"), # video_sliced_progress_info
     )

+def diarize(input_audio_path, num_speakers, min_speakers, max_speakers):
+    print('*'*NUMBER)
+    print(f"Diarize {input_audio_path}")
+
+    # Diarization file
+    diarization_file = "diarization/diarization.json"
+
+    if DEBUG:
+        if os.path.exists(diarization_file):
+            with open(diarization_file, "r") as f:
+                diarization = f.read()
+            return [
+                gr.update(value="Loaded"),
+                gr.update(value=diarization)
+            ]
+
+    # Diarize audio
+    diarization = diarize_audio(input_audio_path, num_speakers, min_speakers, max_speakers, DEVICE)
+
+    # Save diarization
+    with open(diarization_file, "w") as f:
+        json.dump(diarization, f)
+
+    return [
+        gr.update(value="Ok"),
+        gr.update(value=diarization)
+    ]
+
 def trascribe_audio(input_audio_path, source_languaje):
     print('*'*NUMBER)
     print(f"Transcript {input_audio_path}")

+    # Transcription file
+    transcription_file = f"transcriptions/transcription_{source_languaje}.json"
+
+    if DEBUG:
+        if os.path.exists(transcription_file):
+            transcription = open(transcription_file, "r").read()
+            transcription = json.loads(transcription)
+            return [
+                gr.update(value="Loaded"),
+                gr.update(value=transcription)
+            ]
+
     # Get language dict
     language_dict = get_language_dict()

     # Transcribe audio file
-    transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
+    transcription_str, transcription_dict = transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
+
+    # Save transcription
+    with open(transcription_file, "w") as f:
+        transcription_json = json.dumps(transcription_dict)
+        f.write(transcription_json)

     return (
-        gr.
+        gr.update(value="Ok"),
+        gr.update(value=transcription_dict)
     )

 def concatenate_transcriptions():
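The diff imports diarize_audio from diarize_library but does not include that module. The sketch below is a guess at a minimal implementation on top of pyannote.audio, not the repo's actual code; the model name, the Hugging Face token handling, the mapping of 0 to None for unknown speaker counts, and the returned list-of-dicts shape (chosen to match what diarize() passes to json.dump and what merge_transcription_and_diarization() iterates) are all assumptions.

# Hypothetical sketch of diarize_library.diarize_audio; the real module is not
# shown in this commit. Assumes pyannote.audio and a gated Hugging Face model.
import os
import torch
from pyannote.audio import Pipeline

def diarize_audio(audio_path, num_speakers, min_speakers, max_speakers, device):
    # The UI uses 0 for "don't know"; map it to None (assumed convention)
    num_speakers = int(num_speakers) or None
    min_speakers = int(min_speakers) or None
    max_speakers = int(max_speakers) or None

    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ.get("HF_TOKEN"),
    )
    pipeline.to(torch.device(device))

    if num_speakers:
        annotation = pipeline(audio_path, num_speakers=num_speakers)
    else:
        annotation = pipeline(audio_path, min_speakers=min_speakers, max_speakers=max_speakers)

    # Flatten to the JSON-serializable list that diarize() saves and that
    # merge_transcription_and_diarization() iterates
    return [
        {"start": turn.start, "end": turn.end, "speaker": speaker}
        for turn, _, speaker in annotation.itertracks(yield_label=True)
    ]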
@@ -276,6 +339,16 @@ def process_uploaded_video(video_path):
     videos_folder = "videos"
     if not os.path.exists(videos_folder):
         os.makedirs(videos_folder)
+
+    if DEBUG:
+        video_file = f"{videos_folder}/download_video.mp4"
+        if os.path.exists(video_file):
+            return [
+                gr.update(label="Video uploaded"), # video_input
+                gr.update(visible=True), # config_block
+                gr.update(value=video_file), # original_video_path
+                gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
+            ]

     # Copy uploaded video to videos folder
     new_video_path = os.path.join(videos_folder, "download_video.mp4")
@@ -285,7 +358,103 @@ def process_uploaded_video(video_path):
     return [
         gr.update(label="Video uploaded"), # video_input
         gr.update(visible=True), # config_block
-        gr.update(value=new_video_path) # original_video_path
+        gr.update(value=new_video_path), # original_video_path
+        gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
+    ]
+
+def merge_transcription_and_diarization():
+    """
+    Combines the transcription and the diarization to assign a speaker to each word.
+
+    Returns:
+        dict: Combined transcription with speaker information
+    """
+    print('*'*NUMBER)
+    print("Merge transcription and diarization")
+
+    merged_transcription_path = "merged_transcription_diarization/merged.json"
+    if DEBUG:
+        if os.path.exists(merged_transcription_path):
+            with open(merged_transcription_path, 'r') as f:
+                merged_transcription = json.load(f)
+            return [
+                gr.update(value="Loaded"),
+                gr.update(value=merged_transcription)
+            ]
+
+    transcription_path = "transcriptions/transcription_English.json"
+    diarization_path = "diarization/diarization.json"
+    # Load the JSON files
+    with open(transcription_path, 'r') as f:
+        transcription = json.load(f)
+    with open(diarization_path, 'r') as f:
+        diarization = json.load(f)
+
+    # New list for the merged chunks
+    merged_chunks = []
+
+    # For each word in the transcription
+    for chunk in transcription.get('chunks', []):
+        # Check that the chunk has valid timestamps
+        if not (isinstance(chunk.get('start'), (int, float)) and
+                isinstance(chunk.get('end'), (int, float))):
+            continue
+
+        word_start = float(chunk['start'])
+        word_end = float(chunk['end'])
+
+        # Find the matching speaker in the diarization
+        speaker = None
+        for segment in diarization:
+            # Check that the segment has valid timestamps
+            if not (isinstance(segment.get('start'), (int, float)) and
+                    isinstance(segment.get('end'), (int, float))):
+                continue
+
+            segment_start = float(segment['start'])
+            segment_end = float(segment['end'])
+
+            # If the word falls entirely within the segment's time range
+            if word_start >= segment_start and word_end <= segment_end:
+                speaker = segment['speaker']
+                break
+
+            # If the word is mostly inside the segment (more than 50% of its duration)
+            word_duration = word_end - word_start
+            overlap_start = max(word_start, segment_start)
+            overlap_end = min(word_end, segment_end)
+            overlap_duration = max(0, overlap_end - overlap_start)
+
+            if overlap_duration > word_duration * 0.5:
+                speaker = segment['speaker']
+                break
+
+        # Build the new chunk with the speaker information
+        merged_chunk = {
+            'start': word_start,
+            'end': word_end,
+            'text': chunk['text'],
+            'speaker': speaker if speaker else 'UNKNOWN'
+        }
+        merged_chunks.append(merged_chunk)
+
+    # Build the final dictionary
+    merged_transcription = {
+        'text': transcription.get('text', ''),
+        'chunks': merged_chunks
+    }
+
+    # Create the output directory if it does not exist
+    if not os.path.exists(os.path.dirname(merged_transcription_path)):
+        os.makedirs(os.path.dirname(merged_transcription_path))
+
+    # Save the result
+    with open(merged_transcription_path, 'w', encoding='utf-8') as f:
+        json.dump(merged_transcription, f, ensure_ascii=False, indent=2)
+
+    return [
+        gr.update(value="Ok"),
+        gr.update(value=merged_transcription)
     ]

 @spaces.GPU
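To make the assignment rule above concrete, here is a tiny rerun of the same matching logic on made-up timestamps: a word fully inside a segment takes that segment's speaker, and a word straddling a boundary goes to whichever segment covers more than half of its duration.

# Toy illustration of the 50%-overlap rule (hypothetical data)
diarization = [
    {"start": 0.0, "end": 5.0, "speaker": "SPEAKER_00"},
    {"start": 5.0, "end": 10.0, "speaker": "SPEAKER_01"},
]
chunks = [
    {"start": 1.0, "end": 2.0, "text": "hello"},  # fully inside SPEAKER_00
    {"start": 4.8, "end": 5.4, "text": "there"},  # straddles the boundary
]

for chunk in chunks:
    speaker = "UNKNOWN"
    for seg in diarization:
        # Full containment wins immediately
        if chunk["start"] >= seg["start"] and chunk["end"] <= seg["end"]:
            speaker = seg["speaker"]
            break
        # Otherwise require more than half of the word inside the segment
        overlap = min(chunk["end"], seg["end"]) - max(chunk["start"], seg["start"])
        if overlap > (chunk["end"] - chunk["start"]) * 0.5:
            speaker = seg["speaker"]
            break
    print(chunk["text"], "->", speaker)

# hello -> SPEAKER_00
# there -> SPEAKER_01  (0.4 s of its 0.6 s falls in the second segment)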
@@ -336,7 +505,7 @@ def subtify():
         # Layout
         gr.HTML(html_social_media)
         gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
-        gr.HTML(
+        html_subtify_logo_component = gr.HTML(html_subtify_logo_big)

         # Input block, where the user can upload a video and configure the subtify process
         visible = False
@@ -357,21 +526,27 @@ def subtify():
         with gr.Row():
             source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
             target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
-            with gr.Accordion("Advanced settings", open=
-                number_of_speakers = gr.
+            with gr.Accordion("Advanced settings", open=True, visible=True) as Advanced_setings:
+                number_of_speakers = gr.Number(visible=True, label="Number of speakers", show_label=True, value=0, interactive=True, info="Number of speakers in the video, if you don't know, select 0")
+                min_speakers = gr.Number(visible=True, label="Min speakers", show_label=True, value=0, scale=0, interactive=True, info="Minimum number of speakers in the video")
+                max_speakers = gr.Number(visible=True, label="Max speakers", show_label=True, value=0, scale=0, interactive=True, info="Maximum number of speakers in the video")
             subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)

         auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
         with gr.Row():
             get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
             video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
+            diarization_progress_info = gr.Textbox(placeholder="Waiting", label="Diarize progress info", elem_id="diarization_progress_info", interactive=False, visible=visible)
+            merged_transcription_progress_info = gr.Textbox(placeholder="Waiting", label="Merge transcription and diarization progress info", elem_id="merged_transcription_progress_info", interactive=False, visible=visible)
             transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
             video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
             video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)

         original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
         original_video_path = gr.Textbox(label="Original video path", visible=visible)
-
+        transcription = gr.Textbox(label="transcription", elem_id="transcription", visible=VIEW_OUTPUTS)
+        diarization = gr.Textbox(label="diarization", elem_id="diarization", visible=VIEW_OUTPUTS)
+        merged_transcription = gr.Textbox(label="merged_transcription", elem_id="merged_transcription", visible=VIEW_OUTPUTS)
         original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
         subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
         auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
@@ -400,11 +575,11 @@ def subtify():
         video_input.change(
             fn=process_uploaded_video,
             inputs=[video_input],
-            outputs=[video_input, config_block, original_video_path]
+            outputs=[video_input, config_block, original_video_path, html_subtify_logo_component]
         )
         subtify_button.click(
             fn=change_visibility_texboxes,
-            outputs=[auxiliar_block1, get_audio_from_video_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
+            outputs=[auxiliar_block1, get_audio_from_video_info, merged_transcription_progress_info, video_transcribed_progress_info, diarization_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
         )
         auxiliar_block1.change(
             fn=get_audio,
@@ -414,12 +589,17 @@ def subtify():
         get_audio_from_video_info.change(
             fn=trascribe_audio,
             inputs=[original_audio_path, source_languaje],
-            outputs=[video_transcribed_progress_info]
+            outputs=[video_transcribed_progress_info, transcription]
+        )
+        video_transcribed_progress_info.change(
+            fn=diarize,
+            inputs=[original_audio_path, number_of_speakers, min_speakers, max_speakers],
+            outputs=[diarization_progress_info, diarization]
+        )
+        diarization_progress_info.change(
+            fn=merge_transcription_and_diarization,
+            outputs=[merged_transcription_progress_info, merged_transcription]
         )
-        # video_transcribed_progress_info.change(
-        #     fn=concatenate_transcriptions,
-        #     outputs=[transcriptions_concatenated_progress_info, original_audio_transcribed_path]
-        # )
         # transcriptions_concatenated_progress_info.change(
         #     fn=translate_transcription,
         #     inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
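The event wiring daisy-chains the pipeline: each step writes its status into a Textbox, and the next step listens for that Textbox's .change event, so transcription, diarization, and merging run in sequence. A minimal self-contained sketch of the pattern (hypothetical step names, not the app's real components):

import gradio as gr

def step_a():
    # Writing "Ok" into step_a_status fires its .change event
    return gr.update(value="Ok")

def step_b(status):
    return gr.update(value=f"ran after step A reported {status!r}")

with gr.Blocks() as demo:
    run = gr.Button("Run")
    step_a_status = gr.Textbox(label="Step A status")
    step_b_status = gr.Textbox(label="Step B status")

    run.click(fn=step_a, outputs=[step_a_status])
    step_a_status.change(fn=step_b, inputs=[step_a_status], outputs=[step_b_status])

demo.launch()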