Spaces:

katospiegel
/

odtp-pyannote-whisper

Running

App Files Files Community

katospiegel committed on Jan 16

Commit

1c97ed5

1 Parent(s): 6934a38

feat: Functional gradio app

Browse files

Files changed (4) hide show

.gitignore +1 -0
Dockerfile +4 -1
app/app.py +9 -4
app/gradio_app.py +77 -53

.gitignore CHANGED Viewed

@@ -1,6 +1,7 @@
 # ODTP dev
 odtp-input
 odtp-output
 # Mac crap
 .DS_Store

 # ODTP dev
 odtp-input
 odtp-output
+odtp-logs
 # Mac crap
 .DS_Store

Dockerfile CHANGED Viewed

@@ -66,4 +66,7 @@ RUN sed -i 's/\r$//' /odtp/odtp-component-client/odtp-app.sh
 RUN sed -i 's/\r$//' /odtp/odtp-component-client/startup.sh
 RUN sed -i 's/\r$//' /odtp/odtp-app/app.sh
-ENTRYPOINT ["bash", "/odtp/odtp-component-client/startup.sh"]

 RUN sed -i 's/\r$//' /odtp/odtp-component-client/startup.sh
 RUN sed -i 's/\r$//' /odtp/odtp-app/app.sh
+#ENTRYPOINT ["bash", "/odtp/odtp-component-client/startup.sh"]
+ENTRYPOINT [ "python3", "/odtp/odtp-app/gradio_app.py" ]
+# TODO: Create a command that runs the app through an entrypoint (basically the startup mode). Also, in order to work with an API, do I need some interface with S3 to make it work?

app/app.py CHANGED Viewed

@@ -321,7 +321,8 @@ def main(args):
     if args.language:
         whisper_options["language"] = args.language
     writer_options = {"max_line_width":55, "max_line_count":2, "word_timestamps": False}
-    print("Process diarized blocks")
     # Group consecutive segments of the same speaker
     grouped_segments = []
@@ -330,9 +331,11 @@ def main(args):
     current_end = None
     for turn, _, speaker in diarization.itertracks(yield_label=True):
-        print(speaker)
         if turn.end - turn.start < 0.5:  # Suppress short utterances (pyannote artifact)
-            print(f"start={turn.start:.1f}s stop={turn.end:.1f}s IGNORED")
             continue
         if speaker == current_speaker:
@@ -354,7 +357,8 @@ def main(args):
         clip_audio(args.input_file, sample_rate, start, end, clip_path)
         result = model.transcribe(start=start, end=end, options=whisper_options)
         language = result['language']
-        print(f"start={start:.1f}s stop={end:.1f}s lang={language} {speaker}")
         writer(result, args.output_file, speaker, start, writer_options)
         writer_json(generate_segments(result['segments'],  speaker, language), args.output_json_file)
     writer_json.finalize()
@@ -369,6 +373,7 @@ if __name__ == '__main__':
     parser.add_argument('--input-file', type=str, required=True, help="Input audio file")
     parser.add_argument('--output-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
     parser.add_argument('--output-json-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
     args = parser.parse_args()
     main(args)

     if args.language:
         whisper_options["language"] = args.language
     writer_options = {"max_line_width":55, "max_line_count":2, "word_timestamps": False}
+    if args.verbose=="True":
+        print("Process diarized blocks")
     # Group consecutive segments of the same speaker
     grouped_segments = []
     current_end = None
     for turn, _, speaker in diarization.itertracks(yield_label=True):
+        if args.verbose=="True":
+            print(speaker)
         if turn.end - turn.start < 0.5:  # Suppress short utterances (pyannote artifact)
+            if args.verbose=="True":
+                print(f"start={turn.start:.1f}s stop={turn.end:.1f}s IGNORED")
             continue
         if speaker == current_speaker:
         clip_audio(args.input_file, sample_rate, start, end, clip_path)
         result = model.transcribe(start=start, end=end, options=whisper_options)
         language = result['language']
+        if args.verbose=="True":
+            print(f"start={start:.1f}s stop={end:.1f}s lang={language} {speaker}")
         writer(result, args.output_file, speaker, start, writer_options)
         writer_json(generate_segments(result['segments'],  speaker, language), args.output_json_file)
     writer_json.finalize()
     parser.add_argument('--input-file', type=str, required=True, help="Input audio file")
     parser.add_argument('--output-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
     parser.add_argument('--output-json-file', type=str, required=True, help="Output file for the results (SRT or VTT)")
+    parser.add_argument('--verbose', type=str, required=False, help="Printing status")
     args = parser.parse_args()
     main(args)

app/gradio_app.py CHANGED Viewed

@@ -3,8 +3,8 @@ import tempfile
 import os
 import shutil
 import subprocess
-from pathlib import Path
-import io
 def create_temp_structure():
     """Create temporary ODTP folder structure"""
@@ -13,6 +13,11 @@ def create_temp_structure():
     os.makedirs(os.path.join(temp_dir, "odtp-output"))
     return temp_dir
 def cleanup_temp(temp_dir):
     """Remove temporary folder structure"""
     shutil.rmtree(temp_dir)
@@ -22,54 +27,64 @@ def process_audio(audio_file, model, task, language, hf_token):
     # Create temp structure
     temp_dir = create_temp_structure()
-    try:
-        # Copy input file
-        input_path = os.path.join(temp_dir, "odtp-input", "input.wav")
-        shutil.copy2(audio_file, input_path)
-        # Prepare output paths
-        output_base = "output"
-        output_srt = os.path.join(temp_dir, "odtp-output",
-            f"{output_base}.{'translate.' if task == 'translate' else ''}srt")
-        output_json = os.path.join(temp_dir, "odtp-output",
-            f"{output_base}.{'translate.' if task == 'translate' else ''}json")
-        # Build command
-        cmd = [
-            "python3", "/odtp/odtp-app/app.py",
-            "--model", model,
-            "--quantize",
-            "--hf-token", hf_token,
-            "--task", task,
-            "--input-file", input_path,
-            "--output-file", output_srt,
-            "--output-json-file", output_json
-        ]
-        if language != "auto":
-            cmd.extend(["--language", language])
-        # Run transcription
-        subprocess.run(cmd, check=True)
-        # Read results
-        with open(output_srt, 'r', encoding='utf-8') as f:
-            srt_content = f.read()
-        with open(output_json, 'r', encoding='utf-8') as f:
-            json_content = f.read()
-        # Create BytesIO objects for downloads
-        srt_bytes = io.BytesIO(srt_content.encode('utf-8'))
-        srt_bytes.name = "output.srt"
-        json_bytes = io.BytesIO(json_content.encode('utf-8'))
-        json_bytes.name = "output.json"
-        # Return contents and BytesIO objects
-        return srt_content, json_content, srt_bytes, json_bytes
-    finally:
-        # Cleanup
-        cleanup_temp(temp_dir)
 # Define Gradio interface
 with gr.Blocks() as demo:
@@ -103,6 +118,9 @@ with gr.Blocks() as demo:
             submit_btn = gr.Button("Process Audio")
         with gr.Column():
             srt_output = gr.Textbox(
                 label="SRT Output",
                 lines=10
@@ -113,16 +131,18 @@ with gr.Blocks() as demo:
             )
             # Add download buttons
             srt_download = gr.File(
-                label="Download SRT File"
             )
             json_download = gr.File(
-                label="Download JSON File"
             )
     submit_btn.click(
         fn=process_audio,
         inputs=[audio_input, model, task, language, hf_token],
-        outputs=[srt_output, json_output, srt_download, json_download]
     )
 if __name__ == "__main__":
@@ -132,4 +152,8 @@ if __name__ == "__main__":
         share=False,              # Disable temporary public URL
         show_error=True,          # Show detailed error messages
         debug=True               # Enable debug mode for development
-    )

 import os
 import shutil
 import subprocess
+import threading
+import time
 def create_temp_structure():
     """Create temporary ODTP folder structure"""
     os.makedirs(os.path.join(temp_dir, "odtp-output"))
     return temp_dir
+def remove_later(path, delay):
+    time.sleep(delay)
+    if os.path.exists(path):
+        shutil.rmtree(path, ignore_errors=True)
 def cleanup_temp(temp_dir):
     """Remove temporary folder structure"""
     shutil.rmtree(temp_dir)
     # Create temp structure
     temp_dir = create_temp_structure()
+    start_time = time.time()
+    print(f"Processing started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
+    # Copy input file
+    input_path = os.path.join(temp_dir, "odtp-input", "input.wav")
+    shutil.copy2(audio_file, input_path)
+    # Prepare output paths #TODO: Add uuid to output file names
+    output_base = audio_file.split("/")[-1].replace(".wav", "")
+    output_srt = os.path.join(temp_dir, "odtp-output", #temp_dir
+        f"{output_base}_{task}.srt")
+    output_json = os.path.join(temp_dir, "odtp-output",
+        f"{output_base}_{task}.json")
+    # Use HF_TOKEN from environment if not provided
+    if not hf_token:
+        hf_token = os.getenv("HF_TOKEN")
+        if not hf_token:
+            raise ValueError("Hugging Face token is required but not provided.")
+    # Build command
+    cmd = [
+        "python3", "/odtp/odtp-app/app.py",
+        "--model", model,
+        "--quantize",
+        "--hf-token", hf_token,
+        "--task", task,
+        "--input-file", input_path,
+        "--output-file", output_srt,
+        "--output-json-file", output_json,
+        "--verbose", "False"
+    ]
+    if language != "auto":
+        cmd.extend(["--language", language])
+    # Run transcription
+    subprocess.run(cmd, check=True)
+    # Read results
+    with open(output_srt, 'r', encoding='utf-8') as f:
+        srt_content = f.read()
+    with open(output_json, 'r', encoding='utf-8') as f:
+        json_content = f.read()
+    # Delete the temporary directory (and the files in it) from a background thread after 300 seconds
+    threading.Thread(target=remove_later, args=(temp_dir, 300), daemon=True).start()
+    end_time = time.time()
+    print(f"Processing ended at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
+    total_duration = end_time - start_time
+    hours, remainder = divmod(total_duration, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    total_duration_str = f"{int(hours)}h {int(minutes)}m {int(seconds)}s"
+    print(f"Total processing time: {total_duration_str}")
+    return total_duration_str, srt_content, json_content, output_srt, output_json
 # Define Gradio interface
 with gr.Blocks() as demo:
             submit_btn = gr.Button("Process Audio")
         with gr.Column():
+            information = gr.Text(
+                label="Information"
+            )
             srt_output = gr.Textbox(
                 label="SRT Output",
                 lines=10
             )
             # Add download buttons
             srt_download = gr.File(
+                label="Download SRT File",
+                type="binary"
             )
             json_download = gr.File(
+                label="Download JSON File",
+                type="binary"
             )
     submit_btn.click(
         fn=process_audio,
         inputs=[audio_input, model, task, language, hf_token],
+        outputs=[information, srt_output, json_output, srt_download, json_download]
     )
 if __name__ == "__main__":
         share=False,              # Disable temporary public URL
         show_error=True,          # Show detailed error messages
         debug=True               # Enable debug mode for development
+    )
+# TODO: Slow printing on the command.