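"""Gradio demo: transcribe microphone audio to text with the pre-trained
facebook/wav2vec2-base-960h model (Wav2Vec2 encoder + CTC decoding).

Dependencies are installed with pip at import time; a Rust toolchain is
set up first because some pinned packages may need to compile from source."""
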
import subprocess
import sys

# Install a package with pip using the current interpreter
# (pip is a no-op if the requirement is already satisfied)
def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install the Rust toolchain via rustup if rustc is not on PATH
# (needed to build source wheels such as tokenizers)
def install_rust():
    try:
        subprocess.check_call(["rustc", "--version"])
    except (subprocess.CalledProcessError, FileNotFoundError):
        # The pipe must run through a shell; "-y" accepts rustup's defaults
        subprocess.check_call(
            "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y",
            shell=True,
        )

# List of required packages
required_packages = [
    "transformers==4.10.3",
    "datasets",
    "huggingface-hub>=0.19",
    "hf-transfer>=0.1.4",
    "protobuf<4",
    "click<8.1",
    "pydantic~=1.0",
    "librosa==0.8.1",
    "torch==2.2.0",
    "torchaudio==2.2.0",
    "scipy",
    "Cython==0.29.21",
    "phonemizer==2.2.1",
    "scikit-learn",
    "matplotlib",
    "gradio==3.1.4",
    "sentencepiece",
    "sacremoses",
    "tokenizers==0.10.3",
    "resampy>=0.2.2",
    "numba>=0.43.0",
    "soundfile>=0.10.2",
    "pooch>=1.0",
    "decorator>=3.0.0",
    "joblib>=0.14",
    "audioread>=2.0.0"
]

# Make sure Rust is available before installing the packages (some pinned
# dependencies may compile native extensions), then install them all
install_rust()
for package in required_packages:
    install_package(package)

import gradio as gr
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa

# Load pre-trained model and processor
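# facebook/wav2vec2-base-960h: Wav2Vec2 fine-tuned on 960 hours of LibriSpeech for English ASR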
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def transcribe(audio):
    # Load the audio file and resample to 16 kHz, the rate Wav2Vec2 expects
    audio_input, _ = librosa.load(audio, sr=16000)
    
    # Tokenize and run the model; the base-960h feature extractor may not
    # return an attention mask, so pass it only when present
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.get("attention_mask")).logits
    
    # Get predicted ids
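    # (greedy decoding: pick the most probable token at each time step)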
    predicted_ids = torch.argmax(logits, dim=-1)
    
    # Decode the ids to text
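    # batch_decode collapses repeated tokens and strips CTC blank tokens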
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]

# Define the Gradio interface
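# Microphone input is saved to a temporary file (type="filepath") and its path is passed to transcribe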
iface = gr.Interface(
    fn=transcribe, 
    inputs=gr.Audio(source="microphone", type="filepath"), 
    outputs="text"
)

if __name__ == "__main__":
    iface.launch()