File size: 3,944 Bytes
2e802f8 c6ed614 2e802f8 8b01502 2c04385 2e802f8 abc87ba 2e802f8 3c5db57 abc87ba 8b01502 2e802f8 f2a5d21 2e802f8 5ffc16c 2e802f8 8b01502 2e802f8 8b01502 2e802f8 8b01502 2e802f8 8b01502 2e802f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import whisper
from whisper.transcribe import LANGUAGES, TO_LANGUAGE_CODE
from whisper.utils import get_writer
import torch
import spaces
import gradio as gr
import pathlib
import random
from datetime import datetime
# Working directories, resolved relative to the folder containing this script.
APP_DIR = pathlib.Path(__file__).parent.absolute()
LOCAL_DIR = APP_DIR / "files"
SAVE_DIR = LOCAL_DIR / "transcripts"

# Create the parent first: neither call passes parents=True.
LOCAL_DIR.mkdir(exist_ok=True)
SAVE_DIR.mkdir(exist_ok=True)

# Capitalised Whisper language names, offered as the language dropdown choices.
LANGS = [name.capitalize() for name in LANGUAGES.values()]

# NOTE(review): the availability probe below is disabled and the device is
# hard-coded; presumably CUDA is only visible once a GPU has been attached to
# the request by @spaces.GPU -- confirm before re-enabling it.
# DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = 'cuda'

# The model is loaded on the CPU at import time; get_transcript moves it to
# DEVICE on each call.
loaded_model = whisper.load_model("medium", device="cpu")

# Not read anywhere in the visible code.
current_size = "None"
def generate_random_filename():
    """Return a base name (without extension) for a new transcript file.

    The name has the form ``transcript_<HH_MM_SS_dd_mm_YYYY>_<suffix>`` where
    the timestamp comes from the local clock (``datetime.now()``) and
    ``<suffix>`` is 8 characters drawn with ``random.choices`` from ASCII
    letters and digits.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    stamp = datetime.now().strftime("%H_%M_%S_%d_%m_%Y")
    suffix = ''.join(random.choices(alphabet, k=8))
    return "_".join(("transcript", stamp, suffix))
def _positive_or_none(value):
    """Return *value* when it is a number greater than zero, otherwise ``None``."""
    # A cleared numeric field reaches us as None; comparing None > 0 would raise.
    if value is None or value <= 0:
        return None
    return value


@spaces.GPU(enable_queue=True)
def get_transcript(audio_path, task_selection:str, language:str, output_format:str, max_line_width=0, max_line_count=0, max_words_per_line=0):
    """Transcribe (or translate) an audio file and write the result to disk.

    Args:
        audio_path: Path of the audio file to process.
        task_selection: Task name; lower-cased and passed to Whisper as ``task``.
        language: Language name; lower-cased and looked up in
            ``TO_LANGUAGE_CODE``. When empty or ``None`` no language is forced
            and ``language=None`` is passed to Whisper.
        output_format: Output format name; lower-cased and used both to pick
            the writer and as the file extension.
        max_line_width: Writer option; ``0``, a negative value or ``None``
            disables it.
        max_line_count: Writer option; ``0``, a negative value or ``None``
            disables it.
        max_words_per_line: Writer option; ``0``, a negative value or ``None``
            disables it.

    Returns:
        The path, as a string, of the transcript file inside ``SAVE_DIR``.

    Raises:
        gr.Error: If no audio file was provided.
        KeyError: If ``language`` is not a key of ``TO_LANGUAGE_CODE``.
    """
    # Fail early with a readable message instead of passing None to Whisper.
    if not audio_path:
        raise gr.Error("Please upload an audio file before transcribing.")

    extension = output_format.lower()
    writer = get_writer(extension, SAVE_DIR)
    writer_args = {
        "max_line_width": _positive_or_none(max_line_width),
        "max_line_count": _positive_or_none(max_line_count),
        "max_words_per_line": _positive_or_none(max_words_per_line),
    }

    language_code = TO_LANGUAGE_CODE[language.lower()] if language else None
    options = dict(task=task_selection.lower(), best_of=5, language=language_code)

    # The model is loaded on the CPU at import time; move it to DEVICE for this call.
    loaded_model.to(DEVICE)
    results = loaded_model.transcribe(audio_path, verbose=True, word_timestamps=True, **options)

    filename = generate_random_filename()
    writer(results, filename, **writer_args)
    return str(SAVE_DIR / f"{filename}.{extension}")
# HTML banner for the page header; rendered with gr.HTML(title).
# NOTE(review): the "π" after "Auto Transcriber" looks like a mis-decoded
# emoji -- confirm against the original file before changing it.
title="""
<div style="text-align: center; max-width: 500px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
margin-bottom: 10px;
"
>
<h1 style="font-weight: 600; margin-bottom: 7px;">
Auto Transcriber π
</h1>
</div>
</div>
"""
# UI definition: header banner, audio upload and transcription options, a
# submit button, and a file component that receives the generated transcript.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a copy of this file that had lost its indentation -- confirm the layout.
with gr.Blocks() as monapp:
    with gr.Column():
        gr.HTML(title)
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Audiofile to transcribe",
                    sources=["upload"],
                    type="filepath",
                )
                with gr.Accordion("Transcribe options", open=True):
                    task_selection = gr.Radio(
                        ["Transcribe", "Translate"],
                        value="Transcribe",
                        label="Select a Task",
                    )
                    language = gr.Dropdown(
                        choices=LANGS,
                        value="English",
                        label="Language spoken in the audio",
                    )
                    output_format = gr.Radio(
                        ["TXT", "VTT", "SRT", "TSV", "JSON"],
                        value="TXT",
                        label="Format of the output file",
                    )
                    with gr.Column():
                        # Hint text for the three numeric writer options below.
                        gr.HTML(
                            "<p>keep at 0 to <strong>don't use</strong></p>\n"
                            "<p>max_words_per_line has no effect with max_line_width activated\n"
                            "Word-level timestamps on translations may not be reliable.</p>"
                        )
                        max_line_width = gr.Number(
                            label="Maximum number of characters in a line before breaking the line",
                            minimum=0,
                            precision=0,
                            value=0,
                            step=1,
                        )
                        max_line_count = gr.Number(
                            label="Maximum number of lines in a segment",
                            minimum=0,
                            precision=0,
                            value=0,
                            step=1,
                        )
                        max_words_per_line = gr.Number(
                            label="Maximum number of words in a segment",
                            minimum=0,
                            precision=0,
                            value=0,
                            step=1,
                        )
                submit_btn = gr.Button("Transcribe")
            with gr.Column():
                transcript = gr.File(height=50)

    # Input order must match the positional parameters of get_transcript.
    submit_btn.click(
        fn=get_transcript,
        inputs=[
            audio_input,
            task_selection,
            language,
            output_format,
            max_line_width,
            max_line_count,
            max_words_per_line,
        ],
        outputs=[transcript],
    )

monapp.launch(debug=True, show_error=True)
|