File size: 1,561 Bytes
44f2969
7df6e8c
44f2969
1808ded
 
 
f0eb938
44f2969
 
 
 
e473647
1808ded
e473647
1808ded
 
44f2969
 
f0eb938
1808ded
 
44f2969
1808ded
 
7df6e8c
1808ded
 
 
 
7df6e8c
1808ded
 
 
e473647
44f2969
 
1808ded
44f2969
1031935
44f2969
 
 
1031935
44f2969
1808ded
1031935
44f2969
 
 
 
 
1808ded
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import gradio as gr
import soundfile as sf
import torch
import numpy as np
import librosa
from transformers import AutoProcessor, Wav2Vec2BertForCTC
import spaces

# Hugging Face Hub id of the fine-tuned checkpoint used by this demo.
MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"

# Use CUDA device index 0 when a GPU is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

print(f"device: {device}")

# Load the processor and the CTC model once at import time so that every
# call to the transcription function reuses the same objects.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = Wav2Vec2BertForCTC.from_pretrained(MODEL_NAME)
model = model.to(device)


@spaces.GPU
def transcribe(audio_path):
    """Transcribe an audio file to text with the loaded CTC model.

    Args:
        audio_path: Filesystem path of the audio file to transcribe. The
            Gradio ``Audio`` input is configured with ``type="filepath"``;
            the value is ``None``/empty when nothing was uploaded.

    Returns:
        The decoded transcription as a string. An empty string is returned
        when the file contains no audio samples.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Fail with a readable UI message instead of an opaque loader error when
    # the user presses "Submit" without uploading anything.
    if not audio_path:
        raise gr.Error("Please upload an audio file first.")

    # Decode the file and resample it to 16 kHz.
    waveform, sample_rate = librosa.load(audio_path, sr=16_000)

    # Nothing to transcribe; skip feature extraction on an empty signal.
    if waveform.size == 0:
        return ""

    # Keep the full processor output (features plus attention mask, when the
    # processor provides one) so the model can ignore padded frames.
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    inputs = inputs.to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the most likely token id at every frame and
    # let the processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


# Single input widget: an upload-only audio component that hands the
# callback a path on disk.
audio_input = gr.Audio(
    sources="upload",
    type="filepath",
    label="Upload Audio File",
)

# Text shown with the demo; it embeds a link to the checkpoint page.
description = (
    "Transcribe audio inputs with the click of a button! Demo uses the fine-tuned"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Facebook W2V-BERT 2.0 speech encoder "
    "and 🤗 Transformers to transcribe audio files of arbitrary length."
)

# NOTE(review): the "huggingface" theme name and the `allow_flagging`
# keyword may be deprecated in newer Gradio releases -- confirm against the
# Gradio version this app is deployed with.
iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input],
    outputs="text",
    theme="huggingface",
    title="Czech W2V-BERT 2.0 speech encoder demo - transcribe Czech Audio",
    description=description,
    allow_flagging="never",
)

# Listen on all network interfaces so the app is reachable from outside
# the host/container it runs in.
iface.launch(server_name="0.0.0.0")