"""Gradio app that transcribes or translates an uploaded audio file with Whisper.

The Whisper "medium" model is loaded once at import time.  Each request writes
the result in every format produced by ``get_writer("all", ...)`` into
``files/transcripts`` next to this script and hands the ``.txt``, ``.srt`` and
``.vtt`` files back to the UI.
"""

import pathlib
import random
import string
from datetime import datetime
from typing import Optional, Tuple

# `spaces` supplies the `spaces.GPU` decorator used on Hugging Face ZeroGPU
# hardware.  The decorator was referenced without ever importing the package,
# which raised NameError at import time.  The import is optional so the app
# also runs where the package is not installed.
# NOTE(review): imported ahead of torch on purpose -- ZeroGPU is understood to
# want `spaces` loaded before CUDA is touched; confirm against the ZeroGPU docs.
try:
    import spaces
except ImportError:
    spaces = None

import gradio as gr
import torch
import whisper
from whisper.transcribe import LANGUAGES, TO_LANGUAGE_CODE
from whisper.utils import get_writer

APP_DIR = pathlib.Path(__file__).parent.absolute()
LOCAL_DIR = APP_DIR / "files"
LOCAL_DIR.mkdir(exist_ok=True)
SAVE_DIR = LOCAL_DIR / "transcripts"
SAVE_DIR.mkdir(exist_ok=True)

# Display names for the language dropdown ("english" -> "English").
LANGS = [lang.capitalize() for lang in LANGUAGES.values()]
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# The model is loaded on the CPU here and moved to DEVICE inside each request.
loaded_model = whisper.load_model("medium", "cpu")
current_size = "None"  # not referenced anywhere else in this file

# Alphabet for the random filename suffix: a-z, A-Z, 0-9.
_SUFFIX_ALPHABET = string.ascii_letters + string.digits


def _gpu(func):
    """Wrap *func* with ``spaces.GPU`` when ``spaces`` is importable, else return it unchanged."""
    return spaces.GPU(func) if spaces is not None else func


def _positive_or_none(value):
    """Return *value* if it is a positive number, otherwise ``None``.

    The UI uses 0 to mean "option disabled", and a cleared number field
    arrives as ``None``; both are mapped to ``None`` for the writer.
    """
    return value if value and value > 0 else None


def generate_random_filename():
    """Return a base name of the form ``transcript_<HH_MM_SS_dd_mm_YYYY>_<8 random chars>``.

    The name carries no extension; the transcript writer adds one per format.
    """
    timestamp = datetime.now().strftime("%H_%M_%S_%d_%m_%Y")
    random_suffix = ''.join(random.choices(_SUFFIX_ALPHABET, k=8))
    return f"transcript_{timestamp}_{random_suffix}"


@_gpu
def get_transcript(
    audio_path: Optional[str],
    task_selection: str,
    language: Optional[str],
    max_line_width: Optional[int] = 0,
    max_line_count: Optional[int] = 0,
    max_words_per_line: Optional[int] = 0,
) -> Tuple[str, str, str]:
    """Run Whisper on *audio_path* and write the transcript files to ``SAVE_DIR``.

    Args:
        audio_path: Path of the uploaded audio file.
        task_selection: "Transcribe" or "Translate" (lower-cased before use).
        language: Display name of the spoken language, e.g. "English".
            ``None`` (dropdown cleared) passes ``language=None`` to Whisper.
        max_line_width: Maximum characters per line; 0 or ``None`` disables it.
        max_line_count: Maximum lines per segment; 0 or ``None`` disables it.
        max_words_per_line: Maximum words per line; 0 or ``None`` disables it.

    Returns:
        Paths of the generated ``.txt``, ``.srt`` and ``.vtt`` files, in that
        order.

    Raises:
        gr.Error: If no audio file was provided.
        KeyError: If *language* is not a key of ``TO_LANGUAGE_CODE``.
    """
    if not audio_path:
        # Without this guard Whisper fails deep inside audio loading.
        raise gr.Error("Please upload an audio file before transcribing.")

    output_format = "all"
    writer = get_writer(output_format, SAVE_DIR)
    writer_args = {
        "max_line_width": _positive_or_none(max_line_width),
        "max_line_count": _positive_or_none(max_line_count),
        "max_words_per_line": _positive_or_none(max_words_per_line),
    }

    language_code = TO_LANGUAGE_CODE[language.lower()] if language else None
    options = dict(task=task_selection.lower(), best_of=5, language=language_code)

    loaded_model.to(DEVICE)
    results = loaded_model.transcribe(audio_path, verbose=True, word_timestamps=True, **options)
    # sample_rate, audio = audiodata
    # results = loaded_model.transcribe(audio, verbose=True, word_timestamps=True, **options)

    filename = generate_random_filename()
    writer(results, filename, **writer_args)

    return (
        str(SAVE_DIR / f"{filename}.txt"),
        str(SAVE_DIR / f"{filename}.srt"),
        str(SAVE_DIR / f"{filename}.vtt"),
    )


# NOTE(review): the two HTML snippets below look like their markup was stripped
# somewhere upstream; the remaining text is kept verbatim.
title = """

Auto Transcriber 🔊

"""

# Hint shown above the line-splitting options.  Triple-quoted because the text
# spans several physical lines.
LINE_OPTIONS_HINT = """

keep at 0 to don't use

\n

max_words_per_line has no effect with max_line_width activated\nWord-level timestamps on translations may not be reliable.

"""

# NOTE(review): the nesting of the layout blocks is reconstructed from the
# statement order; confirm it matches the intended page layout.
with gr.Blocks() as monapp:
    with gr.Column():
        gr.HTML(title)
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Audiofile to transcribe", sources=["upload"], type="filepath")
            with gr.Accordion("Transcribe options", open=True):
                task_selection = gr.Radio(["Transcribe", "Translate"], value="Transcribe", label="Select a Task")
                language = gr.Dropdown(choices=LANGS, value="English", label="Language spoken in the audio")
                with gr.Column():
                    gr.HTML(LINE_OPTIONS_HINT)
                    # gr.HTML("max_words_per_line has no effect with max_line_width activated")
                    max_line_width = gr.Number(label="Maximum number of characters in a line before breaking the line", minimum=0, precision=0, value=0, step=1)
                    max_line_count = gr.Number(label="Maximum number of lines in a segment", minimum=0, precision=0, value=0, step=1)
                    max_words_per_line = gr.Number(label="Maximum number of words in a segment", minimum=0, precision=0, value=0, step=1)
                # with gr.Group():
                #     active_img_bg= gr.Checkbox(False, label="Enable Background image")
                #     img_bg = gr.Textbox(None, label="Background image", placeholder="Background image path", show_label=False)
            submit_btn = gr.Button("Transcribe")
        with gr.Column():
            transcript_txt = gr.File(height=50)
            transcript_srt = gr.File(height=50)
            transcript_vtt = gr.File(height=50)

    # Event listeners have to be registered inside the Blocks context.
    submit_btn.click(
        fn=get_transcript,
        inputs=[audio_input, task_selection, language, max_line_width, max_line_count, max_words_per_line],
        outputs=[transcript_txt, transcript_srt, transcript_vtt],
    )

monapp.launch(debug=True, show_error=True)