whisper-german / app.py
patrickvonplaten's picture
Update app.py
b55a61f
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers.utils import is_flash_attn_2_available
from transformers.pipelines.audio_utils import ffmpeg_read
import torch
import gradio as gr
import time
# --- Serving limits ---------------------------------------------------------
BATCH_SIZE = 16
MAX_AUDIO_MINS = 30  # maximum audio input in minutes

# --- Hardware selection -----------------------------------------------------
# Half precision on the first CUDA device when one is present; otherwise
# full precision on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

use_flash_attention_2 = is_flash_attn_2_available()

# --- Model and processor ----------------------------------------------------
# Checkpoint shared by the model weights and the processor.
_MODEL_ID = "primeline/whisper-large-v3-german"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    _MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=False,
    use_flash_attention_2=use_flash_attention_2,
)
if not use_flash_attention_2:
    # Flash-attention-2 is unavailable: convert to BetterTransformer instead
    # (original author's note: "use flash attention from pytorch sdpa").
    model = model.to_bettertransformer()

processor = AutoProcessor.from_pretrained(_MODEL_ID)
model.to(device)

# --- Inference pipeline -----------------------------------------------------
# Generation is pinned to German transcription, and timestamps are requested
# from the pipeline.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "de", "task": "transcribe"},
    return_timestamps=True,
)
def transcribe(inputs):
    """Transcribe an audio file with the module-level ``pipe``.

    This is a generator function: it yields the transcription exactly once.
    Validation errors are therefore raised when the generator is first
    advanced, not when ``transcribe`` is called.

    Args:
        inputs: Path of the audio file to transcribe, or ``None`` when the
            user submitted nothing.

    Yields:
        The ``"text"`` entry of the pipeline's output.

    Raises:
        gr.Error: If ``inputs`` is ``None``, or if the decoded audio is longer
            than ``MAX_AUDIO_MINS`` minutes.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

    sampling_rate = pipe.feature_extractor.sampling_rate

    with open(inputs, "rb") as f:
        raw_bytes = f.read()
    # Decode the file at the sampling rate the feature extractor is
    # configured with, so the length check and the pipeline agree.
    waveform = ffmpeg_read(raw_bytes, sampling_rate)

    audio_length_mins = len(waveform) / sampling_rate / 60
    if audio_length_mins > MAX_AUDIO_MINS:
        # The trailing space after the first sentence matters: adjacent
        # f-string literals are concatenated with no separator.
        raise gr.Error(
            f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes. "
            f"Got an audio of length {round(audio_length_mins, 3)} minutes."
        )

    payload = {"array": waveform, "sampling_rate": sampling_rate}
    text = pipe(payload, batch_size=BATCH_SIZE)["text"]
    yield text
if __name__ == "__main__":
    # Static HTML fragments for the page. Their content is kept flush-left so
    # the emitted markup stays exactly as it was.
    headline_html = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
KI Spracherkennung: Kannst du schnell genug reden damit Whisper-German dich <u>nicht</u> versteht?
</h1>
</div>
</div>
"""
    intro_html = """
<p><a href="https://huggingface.co/primeline/whisper-large-v3-german"> Whisper-German</a> ist eines der besten Deutschen
Spracherkennungs Modelle die es gibt. Es basiert auf OpenAI's <a href="https://huggingface.co/openai/whisper-large-v3"> Whisper-v3</a> und wurde auf qualitativ
hochwertigen deutschen Audio Daten weiter trainert </p>
<p> Um zu demonstrieren wie <strong>gut</strong> das Model ist, laden wir dich ein zu versuchen es zu Fehlern zu zwingen. Rede so schnell wie du kannst, so unverstaendlich wie
du kannst oder benutze moglichst komplizierte Wörter um das Modelle dazu zu bringen falsche Transkriptionen zu generieren.
<strong> Diese Demo speichert keinerlei Daten von dir </strong>.
</p>
"""

    with gr.Blocks() as demo:
        # Page header followed by the introductory text.
        gr.HTML(headline_html)
        gr.HTML(intro_html)

        # Microphone input, handed to the callback as a file path.
        mic_input = gr.components.Audio(type="filepath", label="Audio input", sources="microphone")
        submit_button = gr.Button("Transkribiere")
        with gr.Row():
            transcript_box = gr.components.Textbox(label="Whisper-German Transkription", show_copy_button=True)

        # Clicking the button runs `transcribe` on the recording and writes
        # the result into the textbox.
        submit_button.click(fn=transcribe, inputs=mic_input, outputs=[transcript_box])

    # Enable the request queue with max_size=10, then start the app.
    demo.queue(max_size=10).launch()