# asr_pipeline/app.py
import gradio as gr
from audio_processing import process_audio
import torch
import spaces
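
# Note on the `spaces` import: on Hugging Face ZeroGPU Spaces the package must
# be imported so GPU-bound work can be scheduled. The usual pattern (an
# assumption here; no decorator appears in this file, so it presumably lives
# in audio_processing) is:
#
#     @spaces.GPU(duration=60)  # duration in seconds; the value is illustrative
#     def gpu_heavy_function(...):
#         ...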
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
print("No CUDA GPUs available. Running on CPU.")
def transcribe_audio(audio_file, translate, model_size):
    """Run the ASR pipeline on an audio file and format the results as text."""
    language_segments, final_segments = process_audio(
        audio_file, translate=translate, model_size=model_size
    )

    # Summarize where the detected language changes over the recording.
    output = "Detected language changes:\n\n"
    for segment in language_segments:
        output += f"Language: {segment['language']}\n"
        output += f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n"

    # Per-segment transcription, annotated with language and speaker labels.
    output += f"Transcription with language detection and speaker diarization (using {model_size} model):\n\n"
    for segment in final_segments:
        output += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}) {segment['speaker']}:\n"
        output += f"Original: {segment['text']}\n"
        if translate:
            output += f"Translated: {segment['translated']}\n"
        output += "\n"
    return output
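
# Illustrative output for a short clip (values are made up; the layout follows
# the f-strings above):
#
#   Detected language changes:
#
#   Language: en
#   Time: 0.00s - 7.50s
#
#   Transcription with language detection and speaker diarization (using small model):
#
#   [0.00s - 7.50s] (en) SPEAKER_00:
#   Original: Hello everyone.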
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Checkbox(label="Enable Translation"),
        gr.Dropdown(
            choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"],
            label="Whisper Model Size",
            value="small",
        ),
    ],
    outputs="text",
    title="WhisperX Audio Transcription and Translation",
)
iface.launch()
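
# To exercise the pipeline without the UI, the function can be called directly.
# "sample.wav" is a placeholder path, not a file shipped with this repo:
#
#     print(transcribe_audio("sample.wav", translate=False, model_size="small"))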