whisper-german

Runtime error

App Files Files Community

whisper-german / app.py

patrickvonplaten

Update app.py

b55a61f 8 months ago

raw history blame contribute delete

No virus

4.06 kB

	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	from transformers.utils import is_flash_attn_2_available
	from transformers.pipelines.audio_utils import ffmpeg_read
	import torch
	import gradio as gr
	import time

	# Limits applied to every transcription request.
	BATCH_SIZE = 16
	MAX_AUDIO_MINS = 30  # maximum audio input in minutes

	# Pick the first CUDA device with float16 when CUDA is available,
	# otherwise stay on the CPU with float32.
	if torch.cuda.is_available():
		device, torch_dtype = "cuda:0", torch.float16
	else:
		device, torch_dtype = "cpu", torch.float32
	use_flash_attention_2 = is_flash_attn_2_available()

	# Checkpoint id shared by the model and the processor.
	_checkpoint = "primeline/whisper-large-v3-german"

	model = AutoModelForSpeechSeq2Seq.from_pretrained(
		_checkpoint,
		torch_dtype=torch_dtype,
		low_cpu_mem_usage=True,
		use_safetensors=False,
		use_flash_attention_2=use_flash_attention_2,
	)

	# Fallback when flash-attn 2 is not available: convert the model with
	# to_bettertransformer() (flash attention via PyTorch SDPA).
	if not use_flash_attention_2:
		model = model.to_bettertransformer()

	processor = AutoProcessor.from_pretrained(_checkpoint)

	model.to(device)

	# Speech-recognition pipeline used by transcribe(); generation is pinned to
	# language "de" and task "transcribe", and timestamps are requested.
	pipe = pipeline(
		"automatic-speech-recognition",
		model=model,
		tokenizer=processor.tokenizer,
		feature_extractor=processor.feature_extractor,
		max_new_tokens=128,
		chunk_length_s=30,
		torch_dtype=torch_dtype,
		device=device,
		generate_kwargs={"language": "de", "task": "transcribe"},
		return_timestamps=True,
	)

	def transcribe(inputs):
		"""Transcribe the audio file at ``inputs`` with the module-level ``pipe``.

		This is a generator function: it yields the transcription once. Because
		the body contains ``yield``, nothing runs (and no error is raised) until
		the caller starts iterating.

		Args:
			inputs: Path of the audio file to transcribe, or ``None`` when no
				audio was submitted.

		Yields:
			The ``"text"`` entry of the pipeline result.

		Raises:
			gr.Error: If ``inputs`` is ``None``, or if the decoded audio is
				longer than ``MAX_AUDIO_MINS`` minutes.
		"""
		if inputs is None:
			raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

		with open(inputs, "rb") as f:
			audio_bytes = f.read()

		# Decode at the feature extractor's sampling rate so that the sample
		# count divided by that rate gives the duration in seconds.
		sampling_rate = pipe.feature_extractor.sampling_rate
		audio = ffmpeg_read(audio_bytes, sampling_rate)
		audio_length_mins = len(audio) / sampling_rate / 60

		if audio_length_mins > MAX_AUDIO_MINS:
			# The trailing space keeps the two sentences from running together
			# when the adjacent f-strings are concatenated.
			raise gr.Error(
				f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes. "
				f"Got an audio of length {round(audio_length_mins, 3)} minutes."
			)

		pipe_input = {"array": audio, "sampling_rate": sampling_rate}

		text = pipe(pipe_input, batch_size=BATCH_SIZE)["text"]

		yield text

	if __name__ == "__main__":
		# Page markup lives in named strings so the layout code below stays short.
		# Headline shown at the top of the page.
		title_html = """
	<div style="text-align: center; max-width: 700px; margin: 0 auto;">
	<div
	style="
	display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
	"
	>
	<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
	KI Spracherkennung: Kannst du schnell genug reden damit Whisper-German dich <u>nicht</u> versteht?
	</h1>
	</div>
	</div>
	"""
		# Introductory paragraphs with links to the model pages.
		intro_html = """
	<p><a href="https://huggingface.co/primeline/whisper-large-v3-german"> Whisper-German</a> ist eines der besten Deutschen
	Spracherkennungs Modelle die es gibt. Es basiert auf OpenAI's <a href="https://huggingface.co/openai/whisper-large-v3"> Whisper-v3</a> und wurde auf qualitativ
	hochwertigen deutschen Audio Daten weiter trainert </p>

	<p> Um zu demonstrieren wie <strong>gut</strong> das Model ist, laden wir dich ein zu versuchen es zu Fehlern zu zwingen. Rede so schnell wie du kannst, so unverstaendlich wie
	du kannst oder benutze moglichst komplizierte Wörter um das Modelle dazu zu bringen falsche Transkriptionen zu generieren.
	<strong> Diese Demo speichert keinerlei Daten von dir </strong>.
	</p>
	"""

		with gr.Blocks() as demo:
			gr.HTML(title_html)
			gr.HTML(intro_html)

			# Microphone recording is handed to transcribe() as a file path.
			audio = gr.components.Audio(type="filepath", label="Audio input", sources="microphone")
			button = gr.Button("Transkribiere")
			with gr.Row():
				transcription = gr.components.Textbox(label="Whisper-German Transkription", show_copy_button=True)

			# Clicking the button runs transcribe() on the recording and writes
			# its output into the textbox.
			button.click(fn=transcribe, inputs=audio, outputs=[transcription])

		# Queue with max_size=10, then start the app.
		demo.queue(max_size=10).launch()