import gradio as gr
from audio_processing import process_audio, load_models
from transformers import pipeline
import spaces
import torch
import logging
import traceback
import sys

# Set up logging to both stdout and a local file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('app.log')
    ]
)
logger = logging.getLogger(__name__)

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
logger.info(f"Using device: {device}")

# Load the Whisper model once at startup so the first request is not delayed
logger.info("Loading Whisper model...")
try:
    load_models()
except Exception as e:
    logger.error(f"Error loading Whisper model: {str(e)}")
    raise
logger.info("Whisper model loaded successfully.")


def load_summarization_model():
    logger.info("Loading summarization model...")
    try:
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6",
                              device=0 if cuda_available else -1)
    except Exception as e:
        logger.warning(f"Failed to load summarization model on GPU. Falling back to CPU. Error: {str(e)}")
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)
    logger.info("Summarization model loaded.")
    return summarizer


def process_with_fallback(func, *args, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as e:
        logger.error(f"Error during processing: {str(e)}")
        logger.error(traceback.format_exc())
        if "CUDA" in str(e) or "GPU" in str(e):
            logger.info("Falling back to CPU processing...")
            # Assumes process_audio accepts a use_gpu keyword that forces CPU processing
            kwargs['use_gpu'] = False
            return func(*args, **kwargs)
        raise


@spaces.GPU(duration=600)
def transcribe_audio(audio_file, translate, model_size, use_diarization):
    return process_with_fallback(process_audio, audio_file, translate=translate,
                                 model_size=model_size, use_diarization=use_diarization)


@spaces.GPU(duration=600)
def summarize_text(text):
    # The pipeline is reloaded on every call; acceptable for a demo, but it could be cached
    summarizer = load_summarization_model()
    try:
        summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
    except Exception as e:
        logger.error(f"Error during summarization: {str(e)}")
        logger.error(traceback.format_exc())
        summary = "Error occurred during summarization. Please try again."
    return summary
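
# Note: distilbart-cnn-12-6 truncates input beyond its ~1024-token context, so very
# long transcripts are only partially summarized. A minimal sketch of chunked
# summarization; the helper name and chunk size are illustrative, not part of the app:
def summarize_long_text(text, chunk_chars=3000):
    """Summarize a long transcript chunk by chunk, then join the partial summaries."""
    summarizer = load_summarization_model()
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partials = [
        summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]
    return " ".join(partials)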

@spaces.GPU(duration=600)
def process_and_summarize(audio_file, translate, model_size, use_diarization, do_summarize):
    # process_audio is expected to return (formatted_transcription, plain_full_text)
    transcription, full_text = transcribe_audio(audio_file, translate, model_size, use_diarization)
    summary = summarize_text(full_text) if do_summarize else ""
    return transcription, summary


# Main interface
with gr.Blocks() as iface:
    gr.Markdown("# WhisperX Audio Transcription, Translation, and Summarization (with ZeroGPU support)")

    audio_input = gr.Audio(type="filepath")
    translate_checkbox = gr.Checkbox(label="Enable Translation")
    summarize_checkbox = gr.Checkbox(label="Enable Summarization", interactive=False)
    model_dropdown = gr.Dropdown(
        choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"],
        label="Whisper Model Size",
        value="small"
    )
    diarization_checkbox = gr.Checkbox(label="Enable Speaker Diarization")
    process_button = gr.Button("Process Audio")
    transcription_output = gr.Textbox(label="Transcription/Translation")
    summary_output = gr.Textbox(label="Summary")

    # The summarizer is an English model, so summarization is only enabled
    # when translation (to English) is turned on
    def update_summarize_checkbox(translate):
        return gr.Checkbox(interactive=translate)

    translate_checkbox.change(update_summarize_checkbox, inputs=[translate_checkbox], outputs=[summarize_checkbox])

    process_button.click(
        process_and_summarize,
        inputs=[audio_input, translate_checkbox, model_dropdown, diarization_checkbox, summarize_checkbox],
        outputs=[transcription_output, summary_output]
    )

    gr.Markdown(
        f"""
        ## System Information
        - Device: {device}
        - CUDA Available: {"Yes" if cuda_available else "No"}

        ## ZeroGPU Support
        This application supports ZeroGPU for Hugging Face Spaces Pro users.
        GPU-intensive tasks run on the GPU when one is available and fall back to CPU otherwise.
        """
    )

iface.launch()
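
# For reference, this app assumes audio_processing exposes roughly the following
# interface (a hypothetical sketch inferred from the calls above; the real module
# ships alongside this file and may differ):
#
#   def load_models() -> None:
#       """Download and initialize the WhisperX models."""
#
#   def process_audio(audio_file, translate=False, model_size="small",
#                     use_diarization=False, use_gpu=True) -> tuple[str, str]:
#       """Return (formatted_transcription, plain_full_text)."""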