Spaces:
Sleeping
Sleeping
File size: 1,561 Bytes
44f2969 7df6e8c 44f2969 1808ded f0eb938 44f2969 e473647 1808ded e473647 1808ded 44f2969 f0eb938 1808ded 44f2969 1808ded 7df6e8c 1808ded 7df6e8c 1808ded e473647 44f2969 1808ded 44f2969 1031935 44f2969 1031935 44f2969 1808ded 1031935 44f2969 1808ded |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import gradio as gr
import soundfile as sf
import torch
import numpy as np
import librosa
from transformers import AutoProcessor, Wav2Vec2BertForCTC
import spaces
# Hugging Face Hub id of the fine-tuned checkpoint this demo serves.
MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"

# Use CUDA device index 0 when torch can see a GPU; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = 0
print(f"device: {device}")

# Load the processor and the CTC model once at import time so every request
# reuses the same objects.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = Wav2Vec2BertForCTC.from_pretrained(MODEL_NAME)
model = model.to(device)
@spaces.GPU
def transcribe(audio_path):
    """Transcribe a single audio file with the loaded CTC model.

    Args:
        audio_path: Filesystem path of the audio file to transcribe, as
            delivered by the ``gr.Audio(type="filepath")`` input. Gradio
            passes ``None`` when the form is submitted without a file.

    Returns:
        The decoded transcription string for the file.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Fail early with a message shown in the UI; otherwise an empty
    # submission would crash inside librosa.load with an opaque error.
    if not audio_path:
        raise gr.Error("No audio file submitted! Please upload an audio file first.")

    # Load the file at a fixed 16 kHz sampling rate.
    waveform, sampling_rate = librosa.load(audio_path, sr=16_000)
    input_features = processor(
        waveform, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(input_features.to(device)).logits

    # Take the highest-scoring token id along the last axis, then let the
    # processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
# Upload-only audio input; the callback receives the file's path on disk.
audio_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio File")

# Markdown blurb shown under the title, linking to the model card.
demo_description = (
    "Transcribe audio inputs with the click of a button! Demo uses the fine-tuned"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Facebook W2V-BERT 2.0 speech encoder "
    "and 🤗 Transformers to transcribe audio files of arbitrary length."
)

# Wire the transcription callback into a one-input, text-output web UI.
iface = gr.Interface(
    title="Czech W2V-BERT 2.0 speech encoder demo - transcribe Czech Audio",
    description=demo_description,
    fn=transcribe,
    inputs=[audio_input],
    outputs="text",
    theme="huggingface",
    allow_flagging="never",
)

# Bind to all network interfaces so the server is reachable from outside the host.
iface.launch(server_name="0.0.0.0")
|