File size: 5,998 Bytes
c569b48
444b9c9
b360956
aaac499
 
3a346c4
b360956
 
 
 
 
 
 
444b9c9
b360956
 
3a346c4
aaac499
6db9237
b360956
 
444b9c9
b360956
444b9c9
 
b360956
 
 
444b9c9
 
b360956
 
 
 
 
 
 
 
 
 
 
 
 
aaac499
8055777
6db9237
444b9c9
 
 
 
 
 
 
 
f36e52e
8055777
aaac499
444b9c9
b360956
444b9c9
b360956
444b9c9
 
b360956
444b9c9
b360956
444b9c9
aaac499
8055777
b360956
444b9c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aaac499
 
 
b360956
8055777
b360956
 
 
 
8055777
b360956
 
 
 
 
 
 
 
aaac499
b360956
 
 
 
 
aaac499
 
3a346c4
 
444b9c9
 
3a346c4
aaac499
 
3a346c4
aaac499
 
f36e52e
81e4ee2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import gradio as gr
from audio_processing import process_audio
from transformers import pipeline
import spaces
import torch
import logging
import traceback
import sys

# Configure root logging once at import time: timestamped records to stdout
# (stdout, not stderr, so Hugging Face Spaces captures them in the run log).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
# Module-level logger used by every function below.
logger = logging.getLogger(__name__)

def load_summarization_model():
    """Load the DistilBART summarization pipeline, preferring GPU.

    Tries to place the model on CUDA device 0 when available; if that load
    fails for any reason, retries on CPU so the app degrades gracefully.

    Returns:
        A Hugging Face ``transformers`` summarization pipeline.
    """
    # Single source of truth for the checkpoint name (was duplicated in both
    # the GPU and CPU load paths, risking divergence on future edits).
    model_name = "sshleifer/distilbart-cnn-12-6"
    logger.info("Loading summarization model...")
    try:
        cuda_available = torch.cuda.is_available()
        # transformers convention: device 0 = first GPU, -1 = CPU.
        summarizer = pipeline("summarization", model=model_name, device=0 if cuda_available else -1)
        logger.info(f"Summarization model loaded successfully on {'GPU' if cuda_available else 'CPU'}")
        return summarizer
    except Exception as e:
        # GPU load failed (e.g. OOM); fall back to a CPU-only pipeline.
        logger.warning(f"Failed to load summarization model on GPU. Falling back to CPU. Error: {str(e)}")
        summarizer = pipeline("summarization", model=model_name, device=-1)
        logger.info("Summarization model loaded successfully on CPU")
        return summarizer

def process_with_fallback(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying once on CPU for GPU errors.

    If the call raises and the error message mentions CUDA/GPU, the call is
    repeated with ``use_gpu=False`` injected into the keyword arguments;
    any other exception is logged and re-raised unchanged.
    """
    try:
        return func(*args, **kwargs)
    except Exception as err:
        logger.error(f"Error during processing: {str(err)}")
        logger.error(traceback.format_exc())
        message = str(err)
        # Only GPU-related failures get a second chance on the CPU.
        if "CUDA" not in message and "GPU" not in message:
            raise
        logger.info("Falling back to CPU processing...")
        kwargs['use_gpu'] = False
        return func(*args, **kwargs)

@spaces.GPU(duration=60)
def transcribe_audio(audio_file, translate, model_size, use_diarization):
    """Run ``process_audio`` with GPU->CPU fallback, surfacing UI errors.

    Any failure is logged and re-raised as ``gr.Error`` so Gradio shows
    the message to the user instead of a generic traceback.
    """
    logger.info(f"Starting transcription: translate={translate}, model_size={model_size}, use_diarization={use_diarization}")
    try:
        result = process_with_fallback(
            process_audio,
            audio_file,
            translate=translate,
            model_size=model_size,
            use_diarization=use_diarization,
        )
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise gr.Error(f"Transcription failed: {str(e)}")
    else:
        logger.info("Transcription completed successfully")
        return result

@spaces.GPU(duration=60)
def summarize_text(text):
    """Summarize ``text`` with the DistilBART pipeline.

    Returns the summary string, ``""`` for empty input, or a user-facing
    error message if summarization fails (this function never raises, so
    a failed summary does not discard an otherwise good transcription).
    """
    logger.info("Starting text summarization")
    try:
        # Guard: with no transcribed segments the caller passes an empty
        # string; skip the (GPU-costly) pipeline call rather than letting
        # it fail on empty input.
        if not text or not text.strip():
            logger.info("No text to summarize; returning empty summary")
            return ""
        summarizer = load_summarization_model()
        summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        logger.info("Summarization completed successfully")
        return summary
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}")
        logger.error(traceback.format_exc())
        return "Error occurred during summarization. Please try again."

@spaces.GPU(duration=60)
def process_and_summarize(audio_file, translate, model_size, use_diarization, do_summarize):
    """Transcribe an audio file and optionally summarize the result.

    Returns a ``(transcription, summary)`` pair of strings; ``summary`` is
    empty when ``do_summarize`` is false. Failures are re-raised as
    ``gr.Error`` for display in the UI.
    """
    logger.info(f"Starting process_and_summarize: translate={translate}, model_size={model_size}, use_diarization={use_diarization}, do_summarize={do_summarize}")
    try:
        language_segments, final_segments = transcribe_audio(audio_file, translate, model_size, use_diarization)

        # Assemble the report from pieces and join once at the end
        # (equivalent output to repeated concatenation).
        pieces = ["Detected language changes:\n\n"]
        for seg in language_segments:
            pieces.append(f"Language: {seg['language']}\n")
            pieces.append(f"Time: {seg['start']:.2f}s - {seg['end']:.2f}s\n\n")

        pieces.append(f"Transcription with language detection and speaker diarization (using {model_size} model):\n\n")
        text_chunks = []  # raw (or translated) text fed to the summarizer
        for seg in final_segments:
            pieces.append(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] ({seg['language']}) {seg['speaker']}:\n")
            pieces.append(f"Original: {seg['text']}\n")
            if translate:
                pieces.append(f"Translated: {seg['translated']}\n")
                text_chunks.append(seg['translated'])
            else:
                text_chunks.append(seg['text'])
            pieces.append("\n")

        transcription = "".join(pieces)
        full_text = "".join(chunk + " " for chunk in text_chunks)

        summary = summarize_text(full_text) if do_summarize else ""
        logger.info("Process and summarize completed successfully")
        return transcription, summary
    except Exception as e:
        logger.error(f"Process and summarize failed: {str(e)}")
        logger.error(traceback.format_exc())
        raise gr.Error(f"Processing failed: {str(e)}")

# Main interface: flat single-column Gradio Blocks layout wiring the audio
# input and option toggles to process_and_summarize.
with gr.Blocks() as iface:
    gr.Markdown("# WhisperX Audio Transcription, Translation, and Summarization (with ZeroGPU support)")

    audio_input = gr.Audio(type="filepath")  # passes a file path string to process_audio
    translate_checkbox = gr.Checkbox(label="Enable Translation")
    # Summarization is only selectable while translation is enabled (toggled below).
    summarize_checkbox = gr.Checkbox(label="Enable Summarization", interactive=False)
    diarization_checkbox = gr.Checkbox(label="Enable Speaker Diarization")
    model_dropdown = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
    process_button = gr.Button("Process Audio")
    transcription_output = gr.Textbox(label="Transcription/Translation")
    summary_output = gr.Textbox(label="Summary")

    def update_summarize_checkbox(translate):
        # Enable/disable the summarize toggle to track the translation toggle.
        # NOTE(review): this only changes interactivity, not the value — a
        # checked summarize box appears to stay checked (and do_summarize
        # stays True) after translation is turned off; confirm intended.
        return gr.Checkbox(interactive=translate)

    translate_checkbox.change(update_summarize_checkbox, inputs=[translate_checkbox], outputs=[summarize_checkbox])
    
    process_button.click(
        process_and_summarize,
        inputs=[audio_input, translate_checkbox, model_dropdown, diarization_checkbox, summarize_checkbox],
        outputs=[transcription_output, summary_output]
    )

    # Static footer; device info is evaluated once at app start-up.
    gr.Markdown(
        f"""
        ## System Information
        - Device: {"CUDA" if torch.cuda.is_available() else "CPU"}
        - CUDA Available: {"Yes" if torch.cuda.is_available() else "No"}
        
        ## ZeroGPU Support
        This application supports ZeroGPU for Hugging Face Spaces pro users. 
        GPU-intensive tasks are automatically optimized for better performance when available.
        """
    )

iface.launch()