|
import whisper |
|
from whisper.transcribe import LANGUAGES, TO_LANGUAGE_CODE |
|
from whisper.utils import get_writer |
|
import torch |
|
import spaces |
|
import gradio as gr |
|
import pathlib |
|
import random |
|
from datetime import datetime |
|
|
|
# Directory containing this script; generated files are stored beneath it.
APP_DIR = pathlib.Path(__file__).parent.absolute()

# Working directory for app data and the sub-directory that receives transcripts.
LOCAL_DIR = APP_DIR / "files"
LOCAL_DIR.mkdir(exist_ok=True)
SAVE_DIR = LOCAL_DIR / "transcripts"
SAVE_DIR.mkdir(exist_ok=True)

# Capitalized language names offered in the UI dropdown.
# Iterating the dict view directly avoids building a throwaway list first.
LANGS = [lang.capitalize() for lang in LANGUAGES.values()]

# Device the model is moved to when a transcription request is served.
DEVICE = 'cuda'

# The weights are loaded on CPU at import time; get_transcript moves the
# model to DEVICE on each call.
loaded_model = whisper.load_model("medium", "cpu")

# NOTE(review): not referenced anywhere else in the visible source — confirm
# whether it is still needed before removing.
current_size = "None"
|
|
|
def generate_random_filename():
    """Build a transcript base name from the current time plus a random tag.

    Returns:
        A string shaped like ``transcript_HH_MM_SS_DD_MM_YYYY_<tag>`` where
        ``<tag>`` is 8 characters drawn from ASCII letters and digits. No
        file extension is appended.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    stamp = datetime.now().strftime("%H_%M_%S_%d_%m_%Y")
    tag = ''.join(random.choices(alphabet, k=8))
    return f"transcript_{stamp}_{tag}"
|
|
|
def _positive_or_none(value):
    """Return *value* if it is a positive number, otherwise ``None``."""
    # A cleared numeric field arrives as None; treat it like 0 ("not used").
    return value if value is not None and value > 0 else None


@spaces.GPU(enable_queue=True)
def get_transcript(
    audio_path,
    task_selection: str,
    language: str,
    output_format: str,
    max_line_width=0,
    max_line_count=0,
    max_words_per_line=0,
) -> str:
    """Transcribe (or translate) an audio file and write the result to disk.

    Args:
        audio_path: Path of the audio file to process.
        task_selection: Task name; lower-cased and passed to Whisper as ``task``.
        language: Language name used to look up the code in
            ``TO_LANGUAGE_CODE``. When empty or ``None`` no language is
            passed, which lets Whisper detect it from the audio.
        output_format: Writer format name (lower-cased); also used as the
            extension of the returned path.
        max_line_width: Forwarded to the writer when positive, else ``None``.
        max_line_count: Forwarded to the writer when positive, else ``None``.
        max_words_per_line: Forwarded to the writer when positive, else ``None``.

    Returns:
        The path of the written transcript inside ``SAVE_DIR``, as a string.

    Raises:
        gr.Error: If no audio file was provided.
        KeyError: If ``language`` is not a key of ``TO_LANGUAGE_CODE``.
    """
    if not audio_path:
        # Fail early with a readable message instead of an error from deep
        # inside the audio loading code.
        raise gr.Error("Please upload an audio file to transcribe.")

    extension = output_format.lower()
    writer = get_writer(extension, SAVE_DIR)
    writer_args = {
        "max_line_width": _positive_or_none(max_line_width),
        "max_line_count": _positive_or_none(max_line_count),
        "max_words_per_line": _positive_or_none(max_words_per_line),
    }

    language_code = TO_LANGUAGE_CODE[language.lower()] if language else None
    options = dict(task=task_selection.lower(), best_of=5, language=language_code)

    # The model is loaded on CPU at import time; move it to the target device
    # before running inference.
    loaded_model.to(DEVICE)
    results = loaded_model.transcribe(audio_path, verbose=True, word_timestamps=True, **options)

    # The writer derives the output name from the given base name plus the
    # extension of the selected format.
    filename = generate_random_filename()
    writer(results, filename, **writer_args)

    return str(SAVE_DIR / f"{filename}.{extension}")
|
|
|
# HTML banner rendered at the top of the page.
# NOTE(review): the trailing "π" looks like a mis-decoded emoji — confirm the
# intended glyph; it is kept unchanged here.
title = """
<div style="text-align: center; max-width: 500px; margin: 0 auto;">
  <div
    style="
      display: inline-flex;
      align-items: center;
      gap: 0.8rem;
      font-size: 1.75rem;
      margin-bottom: 10px;
    "
  >
    <h1 style="font-weight: 600; margin-bottom: 7px;">
      Auto Transcriber π
    </h1>
  </div>
</div>
"""
|
|
|
# Page layout: banner on top, then a row with the input controls on the left
# and the resulting transcript file on the right.
# NOTE(review): nesting was reconstructed from a source whose indentation was
# lost — confirm the layout against the running app.
with gr.Blocks() as monapp:
    with gr.Column():
        gr.HTML(title)
        with gr.Row():
            # Left column: audio upload, options and the submit button.
            with gr.Column():
                audio_input = gr.Audio(
                    label="Audiofile to transcribe",
                    sources=["upload"],
                    type="filepath",
                )
                with gr.Accordion("Transcribe options", open=True):
                    task_selection = gr.Radio(
                        ["Transcribe", "Translate"],
                        value="Transcribe",
                        label="Select a Task",
                    )
                    language = gr.Dropdown(
                        choices=LANGS,
                        value="English",
                        label="Language spoken in the audio",
                    )
                    output_format = gr.Radio(
                        ["TXT", "VTT", "SRT", "TSV", "JSON"],
                        value="TXT",
                        label="Format of the output file",
                    )
                    # Line-breaking controls; a value of 0 leaves the option unset.
                    with gr.Column():
                        gr.HTML("<p>keep at 0 to <strong>don't use</strong></p>\n<p>max_words_per_line has no effect with max_line_width activated\nWord-level timestamps on translations may not be reliable.</p>")
                        max_line_width = gr.Number(
                            label="Maximum number of characters in a line before breaking the line",
                            minimum=0,
                            precision=0,
                            value=0,
                            step=1,
                        )
                        max_line_count = gr.Number(
                            label="Maximum number of lines in a segment",
                            minimum=0,
                            precision=0,
                            value=0,
                            step=1,
                        )
                        max_words_per_line = gr.Number(
                            label="Maximum number of words in a segment",
                            minimum=0,
                            precision=0,
                            value=0,
                            step=1,
                        )
                submit_btn = gr.Button("Transcribe")

            # Right column: downloadable transcript file.
            with gr.Column():
                transcript = gr.File(height=50)

    # Wire the button to the transcription callback; the inputs are listed in
    # the same order as get_transcript's parameters.
    submit_btn.click(
        fn=get_transcript,
        inputs=[
            audio_input,
            task_selection,
            language,
            output_format,
            max_line_width,
            max_line_count,
            max_words_per_line,
        ],
        outputs=[transcript],
    )

monapp.launch(debug=True, show_error=True)
|
|
|
|