import gradio as gr
from audio_processing import process_audio, load_models
from transformers import pipeline
import spaces
import torch
import logging
import traceback
import sys
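
# Note: audio_processing is a local module shipped with this Space. If you are
# running locally without it, a minimal stub (an assumption -- its signatures
# are inferred from the calls in this file) could look like:
#
#   def load_models():
#       """Load Whisper weights once at startup."""
#
#   def process_audio(audio_file, translate=False, model_size="small",
#                     use_diarization=False, use_gpu=True):
#       """Return (per-segment transcription, full text)."""
#       return "", ""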

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('app.log')
    ]
)
logger = logging.getLogger(__name__)

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
logger.info(f"Using device: {device}")

# Load the Whisper model at startup; fail fast if it cannot be loaded
logger.info("Loading Whisper model...")
try:
    load_models()  # Load Whisper model
except Exception as e:
    logger.error(f"Error loading Whisper model: {str(e)}")
    raise

logger.info("Whisper model loaded successfully.")

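# Build the summarization pipeline on demand; try GPU first, fall back to CPU on failure.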
def load_summarization_model():
    logger.info("Loading summarization model...")
    try:
        summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=0 if cuda_available else -1,
        )
    except Exception as e:
        logger.warning(f"Failed to load summarization model on GPU. Falling back to CPU. Error: {str(e)}")
        summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=-1,
        )
    logger.info("Summarization model loaded.")
    return summarizer

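# Run func with the given args; if a CUDA/GPU error occurs, retry once forcing CPU.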
def process_with_fallback(func, *args, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as e:
        logger.error(f"Error during processing: {str(e)}")
        logger.error(traceback.format_exc())
        if "CUDA" in str(e) or "GPU" in str(e):
            logger.info("Falling back to CPU processing...")
            # Modify kwargs to force CPU processing
            kwargs['use_gpu'] = False
            return func(*args, **kwargs)
        else:
            raise

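# @spaces.GPU requests a ZeroGPU allocation for this call (up to 600 s).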
@spaces.GPU(duration=600)
def transcribe_audio(audio_file, translate, model_size, use_diarization):
    return process_with_fallback(
        process_audio,
        audio_file,
        translate=translate,
        model_size=model_size,
        use_diarization=use_diarization,
    )

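# Summarize the transcript; on error, return a message instead of raising so the UI stays responsive.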
@spaces.GPU(duration=600)
def summarize_text(text):
    summarizer = load_summarization_model()
    try:
        summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
    except Exception as e:
        logger.error(f"Error during summarization: {str(e)}")
        logger.error(traceback.format_exc())
        summary = "Error occurred during summarization. Please try again."
    return summary

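# Top-level handler: transcribe (and optionally translate), then summarize if requested.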
@spaces.GPU(duration=600)
def process_and_summarize(audio_file, translate, model_size, use_diarization, do_summarize):
    transcription, full_text = transcribe_audio(audio_file, translate, model_size, use_diarization)
    summary = summarize_text(full_text) if do_summarize else ""
    return transcription, summary

# Main interface
with gr.Blocks() as iface:
    gr.Markdown("# WhisperX Audio Transcription, Translation, and Summarization (with ZeroGPU support)")
    
    audio_input = gr.Audio(type="filepath")
    translate_checkbox = gr.Checkbox(label="Enable Translation")
    summarize_checkbox = gr.Checkbox(label="Enable Summarization", interactive=False)
    model_dropdown = gr.Dropdown(
        choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"],
        label="Whisper Model Size",
        value="small",
    )
    diarization_checkbox = gr.Checkbox(label="Enable Speaker Diarization")
    process_button = gr.Button("Process Audio")
    transcription_output = gr.Textbox(label="Transcription/Translation")
    summary_output = gr.Textbox(label="Summary")

    def update_summarize_checkbox(translate):
        # Summarization is only offered together with translation; untick and
        # disable the checkbox when translation is turned off.
        if translate:
            return gr.Checkbox(interactive=True)
        return gr.Checkbox(interactive=False, value=False)

    translate_checkbox.change(update_summarize_checkbox, inputs=[translate_checkbox], outputs=[summarize_checkbox])
    
    process_button.click(
        process_and_summarize,
        inputs=[audio_input, translate_checkbox, model_dropdown, diarization_checkbox, summarize_checkbox],
        outputs=[transcription_output, summary_output]
    )

    gr.Markdown(
        f"""
        ## System Information
        - Device: {device}
        - CUDA Available: {"Yes" if cuda_available else "No"}
        
        ## ZeroGPU Support
        This application supports ZeroGPU on Hugging Face Spaces (a Pro feature).
        GPU-intensive tasks run on the GPU when one is available and fall back to CPU otherwise.
        """
    )

if __name__ == "__main__":
    iface.launch()