katospiegel committed on
Commit
1c97ed5
·
1 Parent(s): 6934a38

feat: Functional gradio app

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. Dockerfile +4 -1
  3. app/app.py +9 -4
  4. app/gradio_app.py +77 -53
.gitignore CHANGED
@@ -1,6 +1,7 @@
1
  # ODTP dev
2
  odtp-input
3
  odtp-output
 
4
 
5
  # Mac crap
6
  .DS_Store
 
1
  # ODTP dev
2
  odtp-input
3
  odtp-output
4
+ odtp-logs
5
 
6
  # Mac crap
7
  .DS_Store
Dockerfile CHANGED
@@ -66,4 +66,7 @@ RUN sed -i 's/\r$//' /odtp/odtp-component-client/odtp-app.sh
66
  RUN sed -i 's/\r$//' /odtp/odtp-component-client/startup.sh
67
  RUN sed -i 's/\r$//' /odtp/odtp-app/app.sh
68
 
69
- ENTRYPOINT ["bash", "/odtp/odtp-component-client/startup.sh"]
 
 
 
 
66
  RUN sed -i 's/\r$//' /odtp/odtp-component-client/startup.sh
67
  RUN sed -i 's/\r$//' /odtp/odtp-app/app.sh
68
 
69
+ #ENTRYPOINT ["bash", "/odtp/odtp-component-client/startup.sh"]
70
+ ENTRYPOINT [ "python3", "/odtp/odtp-app/gradio_app.py" ]
71
+
72
+ # TODO: Create a command to run the app that goes through an entrypoint (basically the startup mode). Also, in order to work with an API, do I need some interface with an S3 bucket to make it work?
app/app.py CHANGED
@@ -321,7 +321,8 @@ def main(args):
321
  if args.language:
322
  whisper_options["language"] = args.language
323
  writer_options = {"max_line_width":55, "max_line_count":2, "word_timestamps": False}
324
- print("Process diarized blocks")
 
325
 
326
  # Group consecutive segments of the same speaker
327
  grouped_segments = []
@@ -330,9 +331,11 @@ def main(args):
330
  current_end = None
331
 
332
  for turn, _, speaker in diarization.itertracks(yield_label=True):
333
- print(speaker)
 
334
  if turn.end - turn.start < 0.5: # Suppress short utterances (pyannote artifact)
335
- print(f"start={turn.start:.1f}s stop={turn.end:.1f}s IGNORED")
 
336
  continue
337
 
338
  if speaker == current_speaker:
@@ -354,7 +357,8 @@ def main(args):
354
  clip_audio(args.input_file, sample_rate, start, end, clip_path)
355
  result = model.transcribe(start=start, end=end, options=whisper_options)
356
  language = result['language']
357
- print(f"start={start:.1f}s stop={end:.1f}s lang={language} {speaker}")
 
358
  writer(result, args.output_file, speaker, start, writer_options)
359
  writer_json(generate_segments(result['segments'], speaker, language), args.output_json_file)
360
  writer_json.finalize()
@@ -369,6 +373,7 @@ if __name__ == '__main__':
369
  parser.add_argument('--input-file', type=str, required=True, help="Input audio file")
370
  parser.add_argument('--output-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
371
  parser.add_argument('--output-json-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
 
372
 
373
  args = parser.parse_args()
374
  main(args)
 
321
  if args.language:
322
  whisper_options["language"] = args.language
323
  writer_options = {"max_line_width":55, "max_line_count":2, "word_timestamps": False}
324
+ if args.verbose=="True":
325
+ print("Process diarized blocks")
326
 
327
  # Group consecutive segments of the same speaker
328
  grouped_segments = []
 
331
  current_end = None
332
 
333
  for turn, _, speaker in diarization.itertracks(yield_label=True):
334
+ if args.verbose=="True":
335
+ print(speaker)
336
  if turn.end - turn.start < 0.5: # Suppress short utterances (pyannote artifact)
337
+ if args.verbose=="True":
338
+ print(f"start={turn.start:.1f}s stop={turn.end:.1f}s IGNORED")
339
  continue
340
 
341
  if speaker == current_speaker:
 
357
  clip_audio(args.input_file, sample_rate, start, end, clip_path)
358
  result = model.transcribe(start=start, end=end, options=whisper_options)
359
  language = result['language']
360
+ if args.verbose=="True":
361
+ print(f"start={start:.1f}s stop={end:.1f}s lang={language} {speaker}")
362
  writer(result, args.output_file, speaker, start, writer_options)
363
  writer_json(generate_segments(result['segments'], speaker, language), args.output_json_file)
364
  writer_json.finalize()
 
373
  parser.add_argument('--input-file', type=str, required=True, help="Input audio file")
374
  parser.add_argument('--output-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
375
  parser.add_argument('--output-json-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
376
+ parser.add_argument('--verbose', type=str, required=False, help="Printing status")
377
 
378
  args = parser.parse_args()
379
  main(args)
app/gradio_app.py CHANGED
@@ -3,8 +3,8 @@ import tempfile
3
  import os
4
  import shutil
5
  import subprocess
6
- from pathlib import Path
7
- import io
8
 
9
  def create_temp_structure():
10
  """Create temporary ODTP folder structure"""
@@ -13,6 +13,11 @@ def create_temp_structure():
13
  os.makedirs(os.path.join(temp_dir, "odtp-output"))
14
  return temp_dir
15
 
 
 
 
 
 
16
  def cleanup_temp(temp_dir):
17
  """Remove temporary folder structure"""
18
  shutil.rmtree(temp_dir)
@@ -22,54 +27,64 @@ def process_audio(audio_file, model, task, language, hf_token):
22
  # Create temp structure
23
  temp_dir = create_temp_structure()
24
 
25
- try:
26
- # Copy input file
27
- input_path = os.path.join(temp_dir, "odtp-input", "input.wav")
28
- shutil.copy2(audio_file, input_path)
29
-
30
- # Prepare output paths
31
- output_base = "output"
32
- output_srt = os.path.join(temp_dir, "odtp-output",
33
- f"{output_base}.{'translate.' if task == 'translate' else ''}srt")
34
- output_json = os.path.join(temp_dir, "odtp-output",
35
- f"{output_base}.{'translate.' if task == 'translate' else ''}json")
36
-
37
- # Build command
38
- cmd = [
39
- "python3", "/odtp/odtp-app/app.py",
40
- "--model", model,
41
- "--quantize",
42
- "--hf-token", hf_token,
43
- "--task", task,
44
- "--input-file", input_path,
45
- "--output-file", output_srt,
46
- "--output-json-file", output_json
47
- ]
48
-
49
- if language != "auto":
50
- cmd.extend(["--language", language])
51
-
52
- # Run transcription
53
- subprocess.run(cmd, check=True)
54
-
55
- # Read results
56
- with open(output_srt, 'r', encoding='utf-8') as f:
57
- srt_content = f.read()
58
- with open(output_json, 'r', encoding='utf-8') as f:
59
- json_content = f.read()
60
-
61
- # Create BytesIO objects for downloads
62
- srt_bytes = io.BytesIO(srt_content.encode('utf-8'))
63
- srt_bytes.name = "output.srt"
64
- json_bytes = io.BytesIO(json_content.encode('utf-8'))
65
- json_bytes.name = "output.json"
66
-
67
- # Return contents and BytesIO objects
68
- return srt_content, json_content, srt_bytes, json_bytes
69
 
70
- finally:
71
- # Cleanup
72
- cleanup_temp(temp_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # Define Gradio interface
75
  with gr.Blocks() as demo:
@@ -103,6 +118,9 @@ with gr.Blocks() as demo:
103
  submit_btn = gr.Button("Process Audio")
104
 
105
  with gr.Column():
 
 
 
106
  srt_output = gr.Textbox(
107
  label="SRT Output",
108
  lines=10
@@ -113,16 +131,18 @@ with gr.Blocks() as demo:
113
  )
114
  # Add download buttons
115
  srt_download = gr.File(
116
- label="Download SRT File"
 
117
  )
118
  json_download = gr.File(
119
- label="Download JSON File"
 
120
  )
121
 
122
  submit_btn.click(
123
  fn=process_audio,
124
  inputs=[audio_input, model, task, language, hf_token],
125
- outputs=[srt_output, json_output, srt_download, json_download]
126
  )
127
 
128
  if __name__ == "__main__":
@@ -132,4 +152,8 @@ if __name__ == "__main__":
132
  share=False, # Disable temporary public URL
133
  show_error=True, # Show detailed error messages
134
  debug=True # Enable debug mode for development
135
- )
 
 
 
 
 
3
  import os
4
  import shutil
5
  import subprocess
6
+ import threading
7
+ import time
8
 
9
  def create_temp_structure():
10
  """Create temporary ODTP folder structure"""
 
13
  os.makedirs(os.path.join(temp_dir, "odtp-output"))
14
  return temp_dir
15
 
16
+ def remove_later(path, delay):
17
+ time.sleep(delay)
18
+ if os.path.exists(path):
19
+ shutil.rmtree(path, ignore_errors=True)
20
+
21
  def cleanup_temp(temp_dir):
22
  """Remove temporary folder structure"""
23
  shutil.rmtree(temp_dir)
 
27
  # Create temp structure
28
  temp_dir = create_temp_structure()
29
 
30
+ start_time = time.time()
31
+ print(f"Processing started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
32
+
33
+ # Copy input file
34
+ input_path = os.path.join(temp_dir, "odtp-input", "input.wav")
35
+ shutil.copy2(audio_file, input_path)
36
+
37
+ # Prepare output paths #TODO: Add uuid to output file names
38
+ output_base = audio_file.split("/")[-1].replace(".wav", "")
39
+ output_srt = os.path.join(temp_dir, "odtp-output", #temp_dir
40
+ f"{output_base}_{task}.srt")
41
+ output_json = os.path.join(temp_dir, "odtp-output",
42
+ f"{output_base}_{task}.json")
43
+
44
+ # Use HF_TOKEN from environment if not provided
45
+ if not hf_token:
46
+ hf_token = os.getenv("HF_TOKEN")
47
+ if not hf_token:
48
+ raise ValueError("Hugging Face token is required but not provided.")
49
+
50
+ # Build command
51
+ cmd = [
52
+ "python3", "/odtp/odtp-app/app.py",
53
+ "--model", model,
54
+ "--quantize",
55
+ "--hf-token", hf_token,
56
+ "--task", task,
57
+ "--input-file", input_path,
58
+ "--output-file", output_srt,
59
+ "--output-json-file", output_json,
60
+ "--verbose", "False"
61
+ ]
62
+
63
+ if language != "auto":
64
+ cmd.extend(["--language", language])
 
 
 
 
 
 
 
 
 
65
 
66
+ # Run transcription
67
+ subprocess.run(cmd, check=True)
68
+
69
+ # Read results
70
+ with open(output_srt, 'r', encoding='utf-8') as f:
71
+ srt_content = f.read()
72
+ with open(output_json, 'r', encoding='utf-8') as f:
73
+ json_content = f.read()
74
+
75
+ # Code to delete files after 300 seconds
76
+ threading.Thread(target=remove_later, args=(temp_dir, 300), daemon=True).start()
77
+
78
+ end_time = time.time()
79
+ print(f"Processing ended at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
80
+
81
+ total_duration = end_time - start_time
82
+ hours, remainder = divmod(total_duration, 3600)
83
+ minutes, seconds = divmod(remainder, 60)
84
+ total_duration_str = f"{int(hours)}h {int(minutes)}m {int(seconds)}s"
85
+ print(f"Total processing time: {total_duration_str}")
86
+
87
+ return total_duration_str, srt_content, json_content, output_srt, output_json
88
 
89
  # Define Gradio interface
90
  with gr.Blocks() as demo:
 
118
  submit_btn = gr.Button("Process Audio")
119
 
120
  with gr.Column():
121
+ information = gr.Text(
122
+ label="Information"
123
+ )
124
  srt_output = gr.Textbox(
125
  label="SRT Output",
126
  lines=10
 
131
  )
132
  # Add download buttons
133
  srt_download = gr.File(
134
+ label="Download SRT File",
135
+ type="binary"
136
  )
137
  json_download = gr.File(
138
+ label="Download JSON File",
139
+ type="binary"
140
  )
141
 
142
  submit_btn.click(
143
  fn=process_audio,
144
  inputs=[audio_input, model, task, language, hf_token],
145
+ outputs=[information, srt_output, json_output, srt_download, json_download]
146
  )
147
 
148
  if __name__ == "__main__":
 
152
  share=False, # Disable temporary public URL
153
  show_error=True, # Show detailed error messages
154
  debug=True # Enable debug mode for development
155
+ )
156
+
157
+
158
+
159
+ # TODO: Slow printing on the command.