Maximofn committed
Commit 218960f · 1 Parent(s): 9b5b5b7

Improve code documentation and add docstrings


- Add comprehensive docstrings to functions in app.py and slice_audio.py
- Enhance code readability with clear function descriptions and parameter explanations
- Remove commented-out code and improve inline comments
- Standardize code comments and improve code clarity

Files changed (2)
  1. app.py +44 -19
  2. slice_audio.py +45 -3
app.py CHANGED
@@ -39,9 +39,7 @@ TRANSLATE_TRANSCRIPTIONS = True
 ADD_SUBTITLES_TO_VIDEO = True
 REMOVE_FILES = True
 if DEVICE == "cpu":
-    # I supose that I am on huggingface server
-    # Get RAM space
-    # ram = int(os.popen("free -m | grep Mem | awk '{print $2}'").read())
+    # Assuming we are on huggingface server
     ram = 16000
     factor = 1
     CHUNK_SECONDS = int(ram*factor)
@@ -49,8 +47,7 @@ if DEVICE == "cpu":
     CHUNK_OVERLAP_SECONDS = 5
     print(f"RAM: {ram}, CHUNK_SECONDS: {CHUNK_SECONDS}, CHUNK_OVERLAP_SECONDS: {CHUNK_OVERLAP_SECONDS}")
 else:
-    # I supose that I am on my computer
-    # Get VRAM space
+    # Assuming we are on local machine
     CHUNK_SECONDS = 30
     CHUNK_OVERLAP_SECONDS = 5
 
@@ -73,6 +70,7 @@ html_subtify_logo_small = get_html_subtify_logo_small(new_width, new_height)
 language_dict = union_language_dict()
 
 def remove_all_files():
+    """Remove all temporary files and folders"""
     if os.path.exists("audios"):
         command = f"rm -r audios"
         os.system(command)
@@ -96,6 +94,7 @@ def remove_all_files():
         os.system(command)
 
 def reset_frontend():
+    """Reset all frontend elements to their default state"""
     visible = False
     return (
         None,
@@ -118,9 +117,11 @@ def reset_frontend():
     )
 
 def show_auxiliar_block1():
+    """Show auxiliary block 1 with URL checked message"""
     return gr.Textbox(value="URL checked", visible=False)
 
 def change_visibility_texboxes():
+    """Change visibility of progress info textboxes"""
     return (
         gr.update(value="Done"), # auxiliar_block1
         gr.update(visible=True), # get_audio_from_video_info
@@ -134,6 +135,15 @@ def change_visibility_texboxes():
     )
 
 def get_audio(video_path):
+    """
+    Extract audio from video file.
+
+    Args:
+        video_path (str): Path to video file
+
+    Returns:
+        list: Status update and audio file path
+    """
     print('*'*NUMBER)
     print(f"Getting audio from video {video_path}")
 
@@ -161,6 +171,12 @@ def get_audio(video_path):
     ]
 
 def slice_audio(input_audio_path):
+    """
+    Slice audio into chunks.
+
+    Args:
+        input_audio_path (str): Path to input audio file
+    """
     print('*'*NUMBER)
     print(f"Slicing audio {input_audio_path} in chunks of {CHUNK_SECONDS} seconds with {CHUNK_OVERLAP_SECONDS} seconds overlap")
 
@@ -180,6 +196,15 @@ def slice_audio(input_audio_path):
     )
 
 def diarize(input_audio_path, num_speakers, min_speakers, max_speakers):
+    """
+    Perform speaker diarization on audio file.
+
+    Args:
+        input_audio_path (str): Path to audio file
+        num_speakers (int): Expected number of speakers
+        min_speakers (int): Minimum number of speakers
+        max_speakers (int): Maximum number of speakers
+    """
     print('*'*NUMBER)
     print(f"Diarize {input_audio_path}")
 
@@ -364,10 +389,10 @@ def process_uploaded_video(video_path):
 
 def merge_transcription_and_diarization():
     """
-    Combina la transcripción y la diarización para asignar speakers a cada palabra.
+    Merge transcription and diarization results to assign speakers to each word.
 
     Returns:
-        dict: Transcripción combinada con información de speakers
+        dict: Combined transcription with speaker information
     """
     print('*'*NUMBER)
     print("Merge transcription and diarization")
@@ -382,20 +407,20 @@ def merge_transcription_and_diarization():
             gr.update(value=merged_transcription)
         ]
 
+    # Load JSON files
     transcription_path = "transcriptions/transcription_English.json"
     diarization_path = "diarization/diarization.json"
-    # Cargar los archivos JSON
     with open(transcription_path, 'r') as f:
         transcription = json.load(f)
     with open(diarization_path, 'r') as f:
         diarization = json.load(f)
 
-    # Crear una nueva lista para los chunks combinados
+    # Create new list for combined chunks
    merged_chunks = []
 
-    # Para cada palabra en la transcripción
+    # For each word in transcription
    for chunk in transcription.get('chunks', []):
-        # Verificar que el chunk tiene timestamps válidos
+        # Verify chunk has valid timestamps
        if not (isinstance(chunk.get('start'), (int, float)) and
                isinstance(chunk.get('end'), (int, float))):
            continue
@@ -403,10 +428,10 @@ def merge_transcription_and_diarization():
        word_start = float(chunk['start'])
        word_end = float(chunk['end'])
 
-        # Encontrar el speaker correspondiente en la diarización
+        # Find corresponding speaker in diarization
        speaker = None
        for segment in diarization:
-            # Verificar que el segmento tiene timestamps válidos
+            # Verify segment has valid timestamps
            if not (isinstance(segment.get('start'), (int, float)) and
                    isinstance(segment.get('end'), (int, float))):
                continue
@@ -414,12 +439,12 @@ def merge_transcription_and_diarization():
            segment_start = float(segment['start'])
            segment_end = float(segment['end'])
 
-            # Si la palabra está dentro del rango de tiempo del segmento
+            # If word is within segment time range
            if (word_start >= segment_start and word_end <= segment_end):
                speaker = segment['speaker']
                break
 
-            # Si la palabra está mayormente dentro del segmento (más del 50% de su duración)
+            # If word is mostly within segment (>50% duration)
            word_duration = word_end - word_start
            overlap_start = max(word_start, segment_start)
            overlap_end = min(word_end, segment_end)
@@ -429,7 +454,7 @@ def merge_transcription_and_diarization():
                speaker = segment['speaker']
                break
 
-        # Crear el nuevo chunk con la información del speaker
+        # Create new chunk with speaker information
        merged_chunk = {
            'start': word_start,
            'end': word_end,
@@ -438,17 +463,17 @@ def merge_transcription_and_diarization():
        }
        merged_chunks.append(merged_chunk)
 
-    # Crear el diccionario final
+    # Create final dictionary
    merged_transcription = {
        'text': transcription.get('text', ''),
        'chunks': merged_chunks
    }
 
-    # Crear el directorio si no existe
+    # Create directory if it doesn't exist
    if not os.path.exists(merged_transcription_path):
        os.makedirs(merged_transcription_path)
 
-    # Guardar el resultado en el nuevo directorio
+    # Save result to new directory
    with open(merged_transcription_path, 'w', encoding='utf-8') as f:
        json.dump(merged_transcription, f, ensure_ascii=False, indent=2)
 
slice_audio.py CHANGED
@@ -7,6 +7,19 @@ FOLDER = "chunks"
 DEBUG = True
 
 def seconds_to_hms(seconds):
+    """
+    Convert seconds to hours, minutes, seconds format.
+
+    Args:
+        seconds (int): Total number of seconds to convert
+
+    Returns:
+        tuple: A tuple containing (hours, minutes, seconds)
+
+    Example:
+        >>> seconds_to_hms(3665)
+        (1, 1, 5)  # 1 hour, 1 minute, 5 seconds
+    """
     hour = 00
     minute = 00
     second = seconds
@@ -21,17 +34,46 @@ def seconds_to_hms(seconds):
     return hour, minute, second
 
 def hms_to_seconds(hour, minute, second):
+    """
+    Convert hours, minutes, seconds to total seconds.
+
+    Args:
+        hour (int): Number of hours
+        minute (int): Number of minutes
+        second (int): Number of seconds
+
+    Returns:
+        int: Total number of seconds
+
+    Example:
+        >>> hms_to_seconds(1, 1, 5)
+        3665  # 1 hour + 1 minute + 5 seconds in seconds
+    """
     return hour*3600 + minute*60 + second
 
 def slice_audio(input_audio_path, output_folder, chunks_seconds, chunk_overlap_seconds):
     """
     Slice audio into chunks with specified duration and overlap.
 
+    This function takes an audio file and splits it into smaller chunks with a specified
+    duration and overlap between chunks. It uses ffmpeg for the actual audio processing.
+
     Args:
-        input_audio_path (str): Path to input audio file
-        output_folder (str): Path to output folder
+        input_audio_path (str): Path to the input audio file
+        output_folder (str): Directory where the chunks will be saved
         chunks_seconds (int): Duration of each chunk in seconds
-        chunk_overlap_seconds (int): Overlap between chunks in seconds
+        chunk_overlap_seconds (int): Amount of overlap between consecutive chunks in seconds
+
+    Returns:
+        None: Creates audio chunks in the specified output folder and generates
+            a text file listing all chunk files
+
+    Raises:
+        ValueError: If chunk_overlap_seconds is greater than or equal to chunks_seconds
+
+    Example:
+        >>> slice_audio("input.mp3", "chunks", 30, 5)
+        # Creates chunks of 30 seconds with 5 seconds overlap
+    """
     _, filename = os.path.split(input_audio_path)
     name, extension = os.path.splitext(filename)
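The overlap scheme the new `slice_audio` docstring describes means consecutive chunk start times advance by `chunks_seconds - chunk_overlap_seconds`, so each chunk boundary is covered twice and no words are lost at chunk edges. The function body is not shown past its first lines, so the following is only a sketch under assumptions: the `total_seconds` parameter, the `chunks.txt` listing name, and the `_%03d` filename suffix are hypothetical, and ffmpeg's standard `-ss`/`-t` seek-and-trim flags stand in for whatever invocation the real code uses.

```python
import os
import subprocess

def slice_audio_sketch(input_audio_path, output_folder, chunks_seconds,
                       chunk_overlap_seconds, total_seconds):
    # Hypothetical sketch of the chunking scheme; not the actual slice_audio.py body.
    if chunk_overlap_seconds >= chunks_seconds:
        raise ValueError("chunk_overlap_seconds must be smaller than chunks_seconds")
    os.makedirs(output_folder, exist_ok=True)

    # Consecutive chunks start every (chunks_seconds - chunk_overlap_seconds) seconds.
    step = chunks_seconds - chunk_overlap_seconds
    name, extension = os.path.splitext(os.path.basename(input_audio_path))

    chunk_paths = []
    for i, start in enumerate(range(0, total_seconds, step)):
        chunk_path = os.path.join(output_folder, f"{name}_{i:03d}{extension}")
        # -ss seeks to the chunk start, -t caps the chunk duration.
        subprocess.run(
            ["ffmpeg", "-y", "-ss", str(start), "-t", str(chunks_seconds),
             "-i", input_audio_path, chunk_path],
            check=True,
        )
        chunk_paths.append(chunk_path)

    # Text file listing all chunk files, as the docstring promises.
    with open(os.path.join(output_folder, "chunks.txt"), "w") as f:
        f.write("\n".join(chunk_paths))
```

For example, `slice_audio_sketch("input.mp3", "chunks", 30, 5, 65)` yields chunks starting at 0s, 25s, and 50s, each up to 30 seconds long, matching the 30s/5s defaults the app.py diff sets for the GPU path.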