import torch
import librosa
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Load the fine-tuned CTC model and its matching processor from the Hugging Face Hub
model = AutoModelForCTC.from_pretrained("aoxo/wav2vec2-large-mal")
processor = Wav2Vec2Processor.from_pretrained("aoxo/wav2vec2-large-mal")


def transcribe_audio(audio_path):
    # Load the audio file, resampling to the 16 kHz rate the model expects
    waveform, _ = librosa.load(audio_path, sr=16000)

    # Convert the raw waveform into normalized, model-ready tensors
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

    # Run inference without tracking gradients
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy CTC decoding: take the most likely token at each time step,
    # then let the processor collapse repeats and strip blank tokens
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription


audio_path = "path/to/your/audio/file.wav"
transcription = transcribe_audio(audio_path)
print("Transcription:", transcription)
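
# Optional: if a CUDA GPU is available, inference runs noticeably faster there.
# A minimal sketch, reusing the model and processor loaded above; the device
# handling is an addition for illustration, not part of the original snippet.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def transcribe_audio_on_device(audio_path):
    waveform, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        # Move the input tensor to the same device as the model
        logits = model(inputs.input_values.to(device)).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]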