Spaces:

StevenChen16
/

WhisperX-V2

Running on Zero

App Files Files Community

StevenChen16 committed on Nov 13, 2024

Commit

1e923d6

verified ·

1 Parent(s): 4036c8e

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -54

app.py CHANGED Viewed

@@ -6,63 +6,48 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 import tempfile
 import gc
 import os
-import time
 # Constants
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-BATCH_SIZE = 2  # Reduced batch size
-COMPUTE_TYPE = "int8"  # Changed to int8 for lower memory usage
-FILE_LIMIT_MB = 25  # Reduced file size limit
-def clean_gpu_memory():
-    """Helper function to clean GPU memory"""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        gc.collect()
-@spaces.GPU
 def transcribe_audio(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
-        # Check file size
-        file_size = os.path.getsize(inputs) / (1024 * 1024)  # Convert to MB
-        if file_size > FILE_LIMIT_MB:
-            raise gr.Error(f"File size ({file_size:.1f}MB) exceeds limit of {FILE_LIMIT_MB}MB")
-        # Load audio with error handling
-        try:
             audio = whisperx.load_audio(inputs)
-        except Exception as e:
-            raise gr.Error(f"Error loading audio file: {str(e)}")
         # 1. Transcribe with base Whisper model
-        try:
-            model = whisperx.load_model("large-v3", device=DEVICE, compute_type=COMPUTE_TYPE)
-            result = model.transcribe(audio, batch_size=BATCH_SIZE)
-        finally:
-            clean_gpu_memory()
-            if 'model' in locals():
-                del model
         # 2. Align whisper output
-        try:
-            model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
-            result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)
-        finally:
-            clean_gpu_memory()
-            if 'model_a' in locals():
-                del model_a
         # 3. Diarize audio
-        try:
-            diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ.get("HF_TOKEN"), device=DEVICE)
-            diarize_segments = diarize_model(audio)
-        finally:
-            if 'diarize_model' in locals():
-                del diarize_model
-            clean_gpu_memory()
         # 4. Assign speaker labels
         result = whisperx.assign_word_speakers(diarize_segments, result)
@@ -77,8 +62,12 @@ def transcribe_audio(inputs, task):
         return output_text
     except Exception as e:
-        clean_gpu_memory()
         raise gr.Error(f"Error processing audio: {str(e)}")
 # Create Gradio interface
 demo = gr.Blocks(theme=gr.themes.Ocean())
@@ -91,7 +80,7 @@ with demo:
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
-                label=f"Audio Input (Max {FILE_LIMIT_MB}MB)",
             )
             task = gr.Radio(
                 ["transcribe", "translate"],
@@ -107,17 +96,15 @@ with demo:
                 placeholder="Transcribed text will appear here..."
             )
-    gr.Markdown(f"""
     ### Features:
     - High-accuracy transcription using WhisperX
     - Automatic speaker diarization
     - Support for both microphone recording and file upload
-    - File size limit: {FILE_LIMIT_MB}MB
     ### Note:
-    - Processing may take a few moments
-    - For optimal results, use clear audio with minimal background noise
-    - If you encounter errors, try with a shorter audio clip
     """)
     submit_button.click(
@@ -126,9 +113,4 @@ with demo:
         outputs=output_text
     )
-demo.queue(max_size=1).launch(
-    share=False,
-    debug=True,
-    show_error=True,
-    ssr_mode=False
-)

 import tempfile
 import gc
 import os
 # Constants
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+BATCH_SIZE = 4  # reduce if low on GPU mem
+COMPUTE_TYPE = "float32"  # change to "int8" if low on GPU mem
+FILE_LIMIT_MB = 1000
+@spaces.GPU(duration=200)
 def transcribe_audio(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     try:
+        # Load audio
+        if isinstance(inputs, str):
+            # For file path input
+            audio = whisperx.load_audio(inputs)
+        else:
+            # For microphone input (needs conversion)
             audio = whisperx.load_audio(inputs)
         # 1. Transcribe with base Whisper model
+        model = whisperx.load_model("large-v3", device=DEVICE, compute_type=COMPUTE_TYPE)
+        result = model.transcribe(audio, batch_size=BATCH_SIZE)
+        # Clear GPU memory
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
         # 2. Align whisper output
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
+        result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)
+        # Clear GPU memory again
+        del model_a
+        gc.collect()
+        torch.cuda.empty_cache()
         # 3. Diarize audio
+        diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.environ["HF_TOKEN"], device=DEVICE)
+        diarize_segments = diarize_model(audio)
         # 4. Assign speaker labels
         result = whisperx.assign_word_speakers(diarize_segments, result)
         return output_text
     except Exception as e:
         raise gr.Error(f"Error processing audio: {str(e)}")
+    finally:
+        # Final cleanup
+        gc.collect()
+        torch.cuda.empty_cache()
 # Create Gradio interface
 demo = gr.Blocks(theme=gr.themes.Ocean())
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
+                label="Audio Input (Microphone or File Upload)"
             )
             task = gr.Radio(
                 ["transcribe", "translate"],
                 placeholder="Transcribed text will appear here..."
             )
+    gr.Markdown("""
     ### Features:
     - High-accuracy transcription using WhisperX
     - Automatic speaker diarization
     - Support for both microphone recording and file upload
+    - GPU-accelerated processing
     ### Note:
+    Processing may take a few moments depending on the audio length and system resources.
     """)
     submit_button.click(
         outputs=output_text
     )
+demo.queue().launch(ssr_mode=False)