w2v-bert2-czech / app.py
mikr's picture
gr.Audio
1031935
raw
history blame
1.56 kB
import gradio as gr
import soundfile as sf
import torch
import numpy as np
import librosa
from transformers import AutoProcessor, Wav2Vec2BertForCTC
import spaces
# Hugging Face Hub identifier of the checkpoint used by this demo.
MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"

# Use CUDA device index 0 when a GPU is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"
print("device:", device)

# Load the processor and the CTC model once at import time so every request
# served by the app reuses the same instances.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = Wav2Vec2BertForCTC.from_pretrained(MODEL_NAME)
model = model.to(device)
@spaces.GPU
def transcribe(audio_path):
    """Transcribe one audio file to text with the module-level CTC model.

    Args:
        audio_path: Filesystem path of the audio file to transcribe.

    Returns:
        The decoded transcription of the file as a string.

    Raises:
        gr.Error: If no file path was supplied or the file holds no samples.
    """
    if not audio_path:
        # Without this guard a missing upload surfaces as an opaque loader
        # exception; gr.Error gives the user an actionable message instead.
        raise gr.Error("Please upload an audio file first.")

    # Decode the file and resample it to 16 kHz, the rate handed to the
    # processor below.
    waveform, sample_rate = librosa.load(audio_path, sr=16_000)
    if waveform.size == 0:
        raise gr.Error("The uploaded audio file contains no audio samples.")

    # Keep the whole processor output rather than only `input_features`, so
    # that the attention mask it is expected to produce reaches the model too
    # (this mirrors the documented `model(**inputs)` usage).
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    inputs = inputs.to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the most likely token per frame and let the
    # processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
# Blurb passed as the interface description; it links to the checkpoint page.
_DESCRIPTION = (
    "Transcribe audio inputs with the click of a button! Demo uses the fine-tuned"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Facebook W2V-BERT 2.0 speech encoder "
    "and 🤗 Transformers to transcribe audio files of arbitrary length."
)

# Upload-only audio input; `type="filepath"` makes Gradio hand `transcribe`
# a path on disk rather than decoded samples.
_audio_upload = gr.Audio(
    sources="upload",
    type="filepath",
    label="Upload Audio File",
)

iface = gr.Interface(
    fn=transcribe,
    inputs=[_audio_upload],
    outputs="text",
    theme="huggingface",
    title="Czech W2V-BERT 2.0 speech encoder demo - transcribe Czech Audio",
    description=_DESCRIPTION,
    allow_flagging="never",
)

# Listen on all network interfaces rather than only localhost.
iface.launch(server_name="0.0.0.0")