---
license: cc-by-4.0
datasets:
  - nwu-ctext/nchlt
language:
  - afr
  - eng
  - nbl
  - nso
  - sot
  - ssw
  - tsn
  - tso
  - ven
  - xho
  - zul
base_model: facebook/mms-1b-all
pipeline_tag: automatic-speech-recognition
---

## Inference Example

```python
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

model_name = "guymandude/MMS-ASR-ZA-11"

def load_audio_file(path):
    # MMS expects 16 kHz audio, so resample on load
    audio_array, sampling_rate = librosa.load(path, sr=16_000)
    return {"array": audio_array, "sampling_rate": sampling_rate}

model = Wav2Vec2ForCTC.from_pretrained(model_name, ignore_mismatched_sizes=True).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Change to any supported language: eng, afr, sot, zul, xho, nso, nbl, tso, tsn, ven, ssw
processor.tokenizer.set_target_lang("tsn")
model.load_adapter("tsn")

audio = load_audio_file("<AUDIO PATH>")

input_dict = processor(audio["array"], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(input_dict.input_values.to("cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)[0]

print(processor.decode(pred_ids))
```
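
To transcribe a different language, load that language's adapter and point the tokenizer at its vocabulary before running inference again. A minimal sketch, assuming the checkpoint ships adapter weights for every language listed above (as the comment in the example indicates), using isiZulu (`zul`) as the example code:

```python
# Switch the model to another supported language, e.g. isiZulu ("zul")
processor.tokenizer.set_target_lang("zul")
model.load_adapter("zul")

# Re-run inference with the newly loaded adapter
audio = load_audio_file("<AUDIO PATH>")
input_dict = processor(audio["array"], sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(input_dict.input_values.to("cuda")).logits
print(processor.decode(torch.argmax(logits, dim=-1)[0]))
```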
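
For quick experiments, the `automatic-speech-recognition` pipeline can wrap the same steps. This is a sketch based on how the base `facebook/mms-1b-all` checkpoint is typically used with `pipeline`; that `target_lang` behaves the same with this fine-tuned checkpoint is an assumption:

```python
from transformers import pipeline

# Assumes this checkpoint exposes per-language adapters the same way the base
# facebook/mms-1b-all model does; "tsn" is the target language code.
asr = pipeline(
    "automatic-speech-recognition",
    model="guymandude/MMS-ASR-ZA-11",
    model_kwargs={"target_lang": "tsn", "ignore_mismatched_sizes": True},
)

print(asr("<AUDIO PATH>")["text"])
```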