import subprocess
import sys


# Function to install a package if not already installed
def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


# Install the Rust compiler if not present (needed to build some packages,
# e.g. tokenizers, from source)
def install_rust():
    try:
        subprocess.check_call(["rustc", "--version"])
    except (subprocess.CalledProcessError, FileNotFoundError):
        # The pipe requires a shell; "-s -- -y" makes the rustup installer non-interactive.
        subprocess.check_call(
            "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y",
            shell=True,
        )


# List of required packages
required_packages = [
    "transformers==4.10.3",
    "datasets",
    "huggingface-hub>=0.19",
    "hf-transfer>=0.1.4",
    "protobuf<4",
    "click<8.1",
    "pydantic~=1.0",
    "librosa==0.8.1",
    "torch==2.2.0",
    "torchaudio==2.2.0",
    "scipy",
    "Cython==0.29.21",
    "phonemizer==2.2.1",
    "scikit-learn",
    "matplotlib",
    "gradio==3.1.4",
    "sentencepiece",
    "sacremoses",
    "tokenizers==0.10.3",
    "resampy>=0.2.2",
    "numba>=0.43.0",
    "soundfile>=0.10.2",
    "pooch>=1.0",
    "decorator>=3.0.0",
    "joblib>=0.14",
    "audioread>=2.0.0",
]

# Install Rust first so packages with Rust extensions can compile,
# then install all required packages
install_rust()
for package in required_packages:
    install_package(package)

import gradio as gr
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa

# Load pre-trained model and processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def transcribe(audio):
    # Load audio, resampled to the 16 kHz rate the model expects
    audio_input, _ = librosa.load(audio, sr=16000)

    # Tokenize and process
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)

    # The base-960h feature extractor does not return an attention mask,
    # so fall back to None if it is absent
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.get("attention_mask")).logits

    # Get predicted ids
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the ids to text
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
)

if __name__ == "__main__":
    iface.launch()
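
# Optional quick check without the Gradio UI (a minimal sketch; "sample.wav"
# is a hypothetical local 16 kHz-compatible recording, not part of the app):
#
#     print(transcribe("sample.wav"))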