Spaces:
Runtime error
Runtime error
File size: 4,064 Bytes
7091430 172ec24 7091430 172ec24 7091430 81a9d24 7091430 172ec24 2ad1599 7091430 c632ef6 7091430 81a9d24 985c6bd 7091430 172ec24 80ca55c 7091430 c632ef6 7091430 b55a61f 7091430 0c821a6 e669559 0c821a6 2434d04 0c821a6 2434d04 3318cc5 7091430 3318cc5 e669559 7091430 e669559 3318cc5 7091430 c632ef6 8bda11a 172ec24 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers.utils import is_flash_attn_2_available
from transformers.pipelines.audio_utils import ffmpeg_read
import torch
import gradio as gr
import time
# Inference settings shared by the transcription handler.
BATCH_SIZE = 16
MAX_AUDIO_MINS = 30  # maximum audio input in minutes

# Checkpoint used for both the model weights and the processor.
_MODEL_ID = "primeline/whisper-large-v3-german"

# Half precision on the first CUDA device when one is present,
# full precision on the CPU otherwise.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

use_flash_attention_2 = is_flash_attn_2_available()

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    _MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=False,
    use_flash_attention_2=use_flash_attention_2,
)
if not use_flash_attention_2:
    # Flash Attention 2 is unavailable: convert with BetterTransformer instead
    # (per the original note, this uses flash attention from PyTorch SDPA).
    model = model.to_bettertransformer()

processor = AutoProcessor.from_pretrained(_MODEL_ID)
model.to(device)

# Speech-recognition pipeline used by the request handler; it is fixed to
# German transcription and asked to return timestamps.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "de", "task": "transcribe"},
    return_timestamps=True,
)
def transcribe(inputs):
    """Transcribe the audio file at ``inputs`` with the module-level ``pipe``.

    Args:
        inputs: Path of the audio file to transcribe, or ``None`` when no
            audio was submitted.

    Yields:
        The ``"text"`` entry of the pipeline result, exactly once.

    Raises:
        gr.Error: If ``inputs`` is ``None``, or if the decoded audio is longer
            than ``MAX_AUDIO_MINS`` minutes.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

    sampling_rate = pipe.feature_extractor.sampling_rate

    with open(inputs, "rb") as audio_file:
        audio_bytes = audio_file.read()

    # Decode at the feature extractor's sampling rate so that the sample count
    # divided by that rate gives the duration in seconds.
    samples = ffmpeg_read(audio_bytes, sampling_rate)
    audio_length_mins = len(samples) / sampling_rate / 60

    if audio_length_mins > MAX_AUDIO_MINS:
        # The trailing space after the first sentence keeps the two adjacent
        # f-strings from running together in the message shown to the user.
        raise gr.Error(
            f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes. "
            f"Got an audio of length {round(audio_length_mins, 3)} minutes."
        )

    audio = {"array": samples, "sampling_rate": sampling_rate}
    yield pipe(audio, batch_size=BATCH_SIZE)["text"]
if __name__ == "__main__":
    # Markup for the headline banner shown at the top of the page.
    headline_html = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
KI Spracherkennung: Kannst du schnell genug reden damit Whisper-German dich <u>nicht</u> versteht?
</h1>
</div>
</div>
"""

    # Markup for the introduction paragraph (no placeholders, so a plain string).
    intro_html = """
<p><a href="https://huggingface.co/primeline/whisper-large-v3-german"> Whisper-German</a> ist eines der besten Deutschen
Spracherkennungs Modelle die es gibt. Es basiert auf OpenAI's <a href="https://huggingface.co/openai/whisper-large-v3"> Whisper-v3</a> und wurde auf qualitativ
hochwertigen deutschen Audio Daten weiter trainert </p>
<p> Um zu demonstrieren wie <strong>gut</strong> das Model ist, laden wir dich ein zu versuchen es zu Fehlern zu zwingen. Rede so schnell wie du kannst, so unverstaendlich wie
du kannst oder benutze moglichst komplizierte Wörter um das Modelle dazu zu bringen falsche Transkriptionen zu generieren.
<strong> Diese Demo speichert keinerlei Daten von dir </strong>.
</p>
"""

    with gr.Blocks() as demo:
        # Components are created in display order: banner, intro, recorder,
        # submit button, then the transcription output in its own row.
        gr.HTML(headline_html)
        gr.HTML(intro_html)

        audio = gr.components.Audio(
            type="filepath",
            label="Audio input",
            sources="microphone",
        )
        button = gr.Button("Transkribiere")

        with gr.Row():
            transcription = gr.components.Textbox(
                label="Whisper-German Transkription",
                show_copy_button=True,
            )

        # Clicking the button sends the recorded file path to `transcribe`
        # and writes what it yields into the textbox.
        button.click(fn=transcribe, inputs=audio, outputs=[transcription])

    # Enable the request queue with at most 10 entries, then start the app.
    queued_demo = demo.queue(max_size=10)
    queued_demo.launch()
|