from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers.utils import is_flash_attn_2_available
from transformers.pipelines.audio_utils import ffmpeg_read
import torch
import gradio as gr
import time

BATCH_SIZE = 16  # number of 30-second chunks transcribed in parallel
MAX_AUDIO_MINS = 30  # maximum audio input in minutes

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
use_flash_attention_2 = is_flash_attn_2_available()  # True only if the flash-attn package and a supported GPU are present

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "primeline/whisper-large-v3-german",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=False,
    use_flash_attention_2=use_flash_attention_2,
)

if not use_flash_attention_2:
    # fall back to PyTorch's SDPA attention via BetterTransformer (requires optimum)
    model = model.to_bettertransformer()

processor = AutoProcessor.from_pretrained("primeline/whisper-large-v3-german")

model.to(device)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "de", "task": "transcribe"},
    return_timestamps=True
)
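
# With return_timestamps=True the pipeline returns a dict with the full "text" plus a
# "chunks" list of {"timestamp": (start, end), "text": ...} segments. A minimal sanity
# check (a sketch, assuming a 16 kHz mono float32 waveform):
#
#   import numpy as np
#   sample = {"array": np.zeros(16000, dtype=np.float32), "sampling_rate": 16000}
#   print(pipe(sample)["text"])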

def transcribe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

    with open(inputs, "rb") as f:
        inputs = f.read()

    # decode the raw bytes with ffmpeg into a float32 waveform at the model's sampling rate
    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    audio_length_mins = len(inputs) / pipe.feature_extractor.sampling_rate / 60

    if audio_length_mins > MAX_AUDIO_MINS:
        raise gr.Error(
            f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes. "
            f"Got an audio of length {round(audio_length_mins, 3)} minutes."
        )

    # the pipeline accepts a dict holding the raw waveform and its sampling rate
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

    text = pipe(inputs, batch_size=BATCH_SIZE)["text"]

    yield text
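
# Note: transcribe is a generator function; Gradio streams each yielded value to the
# output component, so a single yield behaves like a plain return here.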

if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.HTML(
            """
                <div style="text-align: center; max-width: 700px; margin: 0 auto;">
                  <div
                    style="
                      display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
                    "
                  >
                    <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
                      AI speech recognition: Can you speak fast enough that Whisper-German does <u>not</u> understand you?
                    </h1>
                  </div>
                </div>
            """
        )
        gr.HTML(
            f"""
            <p><a href="https://huggingface.co/primeline/whisper-large-v3-german"> Whisper-German</a> ist eines der besten Deutschen 
            Spracherkennungs Modelle die es gibt. Es basiert auf OpenAI's <a href="https://huggingface.co/openai/whisper-large-v3"> Whisper-v3</a> und wurde auf qualitativ
            hochwertigen deutschen Audio Daten weiter trainert </p>
            
            <p> Um zu demonstrieren wie <strong>gut</strong> das Model ist, laden wir dich ein zu versuchen es zu Fehlern zu zwingen. Rede so schnell wie du kannst, so unverstaendlich wie 
            du kannst oder benutze moglichst komplizierte Wörter um das Modelle dazu zu bringen falsche Transkriptionen zu generieren.
            <strong> Diese Demo speichert keinerlei Daten von dir </strong>.
            </p>
            """
        )
        audio = gr.Audio(type="filepath", label="Audio input", sources=["microphone"])
        button = gr.Button("Transcribe")
        with gr.Row():
            transcription = gr.Textbox(label="Whisper-German transcription", show_copy_button=True)
            
        button.click(
            fn=transcribe,
            inputs=audio,
            outputs=[transcription],
        )
    demo.queue(max_size=10).launch()
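
# To run locally (a sketch of the assumed dependencies, not a pinned requirements list):
#   pip install torch transformers gradio optimum accelerate
# ffmpeg must also be installed on the system for ffmpeg_read to decode audio, then:
#   python app.py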