Spaces:

StevenChen16
/

WhisperX-V2

Running on Zero

App Files Files Community

StevenChen16 committed on Nov 13, 2024

Commit

4036c8e

verified ·

1 Parent(s): aa547ad

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -35

app.py CHANGED Viewed

@@ -6,12 +6,19 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 import tempfile
 import gc
 import os
 # Constants
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-BATCH_SIZE = 4  # reduce if low on GPU mem
-COMPUTE_TYPE = "float32"  # change to "int8" if low on GPU mem
-FILE_LIMIT_MB = 1000
 @spaces.GPU
 def transcribe_audio(inputs, task):
@@ -19,35 +26,43 @@ def transcribe_audio(inputs, task):
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
-        # Load audio
-        if isinstance(inputs, str):
-            # For file path input
-            audio = whisperx.load_audio(inputs)
-        else:
-            # For microphone input (needs conversion)
             audio = whisperx.load_audio(inputs)
         # 1. Transcribe with base Whisper model
-        model = whisperx.load_model("large-v3", device=DEVICE, compute_type=COMPUTE_TYPE)
-        result = model.transcribe(audio, batch_size=BATCH_SIZE)
-        # Clear GPU memory
-        del model
-        gc.collect()
-        torch.cuda.empty_cache()
         # 2. Align whisper output
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
-        result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)
-        # Clear GPU memory again
-        del model_a
-        gc.collect()
-        torch.cuda.empty_cache()
         # 3. Diarize audio
-        diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ["HF_TOKEN"], device=DEVICE)
-        diarize_segments = diarize_model(audio)
         # 4. Assign speaker labels
         result = whisperx.assign_word_speakers(diarize_segments, result)
@@ -62,12 +77,8 @@ def transcribe_audio(inputs, task):
         return output_text
     except Exception as e:
         raise gr.Error(f"Error processing audio: {str(e)}")
-    finally:
-        # Final cleanup
-        gc.collect()
-        torch.cuda.empty_cache()
 # Create Gradio interface
 demo = gr.Blocks(theme=gr.themes.Ocean())
@@ -80,7 +91,7 @@ with demo:
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
-                label="Audio Input (Microphone or File Upload)"
             )
             task = gr.Radio(
                 ["transcribe", "translate"],
@@ -96,15 +107,17 @@ with demo:
                 placeholder="Transcribed text will appear here..."
             )
-    gr.Markdown("""
     ### Features:
     - High-accuracy transcription using WhisperX
     - Automatic speaker diarization
     - Support for both microphone recording and file upload
-    - GPU-accelerated processing
     ### Note:
-    Processing may take a few moments depending on the audio length and system resources.
     """)
     submit_button.click(
@@ -113,4 +126,9 @@ with demo:
         outputs=output_text
     )
-demo.queue().launch(ssr_mode=False)

 import tempfile
 import gc
 import os
+import time
 # Constants
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+BATCH_SIZE = 2  # Reduced batch size
+COMPUTE_TYPE = "int8"  # Changed to int8 for lower memory usage
+FILE_LIMIT_MB = 25  # Reduced file size limit
+def clean_gpu_memory():  # does nothing at all when CUDA is unavailable (gc.collect() is skipped too)
+    """Helper function to clean GPU memory"""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()  # asks PyTorch to release its unused cached GPU memory
+        gc.collect()  # NOTE(review): runs after the cache flush, so memory freed by this pass presumably stays cached until the next call — consider collecting first
 @spaces.GPU
 def transcribe_audio(inputs, task):
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
+        # Check file size
+        file_size = os.path.getsize(inputs) / (1024 * 1024)  # Convert to MB
+        if file_size > FILE_LIMIT_MB:
+            raise gr.Error(f"File size ({file_size:.1f}MB) exceeds limit of {FILE_LIMIT_MB}MB")
+        # Load audio with error handling
+        try:
             audio = whisperx.load_audio(inputs)
+        except Exception as e:
+            raise gr.Error(f"Error loading audio file: {str(e)}")
         # 1. Transcribe with base Whisper model
+        try:
+            model = whisperx.load_model("large-v3", device=DEVICE, compute_type=COMPUTE_TYPE)
+            result = model.transcribe(audio, batch_size=BATCH_SIZE)
+        finally:
+            clean_gpu_memory()
+            if 'model' in locals():
+                del model
         # 2. Align whisper output
+        try:
+            model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
+            result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)
+        finally:
+            clean_gpu_memory()
+            if 'model_a' in locals():
+                del model_a
         # 3. Diarize audio
+        try:
+            diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ.get("HF_TOKEN"), device=DEVICE)
+            diarize_segments = diarize_model(audio)
+        finally:
+            if 'diarize_model' in locals():
+                del diarize_model
+            clean_gpu_memory()
         # 4. Assign speaker labels
         result = whisperx.assign_word_speakers(diarize_segments, result)
         return output_text
     except Exception as e:
+        clean_gpu_memory()
         raise gr.Error(f"Error processing audio: {str(e)}")
 # Create Gradio interface
 demo = gr.Blocks(theme=gr.themes.Ocean())
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
+                label=f"Audio Input (Max {FILE_LIMIT_MB}MB)",
             )
             task = gr.Radio(
                 ["transcribe", "translate"],
                 placeholder="Transcribed text will appear here..."
             )
+    gr.Markdown(f"""
     ### Features:
     - High-accuracy transcription using WhisperX
     - Automatic speaker diarization
     - Support for both microphone recording and file upload
+    - File size limit: {FILE_LIMIT_MB}MB
     ### Note:
+    - Processing may take a few moments
+    - For optimal results, use clear audio with minimal background noise
+    - If you encounter errors, try with a shorter audio clip
     """)
     submit_button.click(
         outputs=output_text
     )
+demo.queue(max_size=1).launch(
+    share=False,
+    debug=True,
+    show_error=True,
+    ssr_mode=False
+)