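## Usage

The snippet below loads the `ixxan/whisper-small-ug-cv-15` checkpoint, a Whisper-small model fine-tuned for Uyghur, and wraps it in a `transcribe` helper that loads an audio file, downmixes it to mono, resamples it to the 16 kHz rate the feature extractor expects, and decodes the generated token ids into text: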
```python
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Load the processor and the fine-tuned Uyghur Whisper checkpoint
processor = AutoProcessor.from_pretrained("ixxan/whisper-small-ug-cv-15")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-ug-cv-15")

# Move the model to GPU once, if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def transcribe(audio_path: str) -> str:
    """
    Transcribe an audio file to text with the Whisper model for Uyghur.

    Args:
        audio_path (str): Path to the audio file to transcribe.

    Returns:
        str: The transcription of the audio.
    """
    # Load the audio file (shape: [channels, samples])
    audio_input, sampling_rate = torchaudio.load(audio_path)

    # Downmix multi-channel audio to mono
    if audio_input.shape[0] > 1:
        audio_input = audio_input.mean(dim=0, keepdim=True)

    # Resample to the rate the feature extractor expects (16 kHz for Whisper)
    target_rate = processor.feature_extractor.sampling_rate
    if sampling_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
        audio_input = resampler(audio_input)

    # Convert the waveform to log-mel input features and move them to the device
    inputs = processor(audio_input.squeeze(), sampling_rate=target_rate, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate token ids without tracking gradients
    with torch.no_grad():
        generated_ids = model.generate(inputs["input_features"], max_length=225)

    # Decode the token ids into the transcription text
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```
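A minimal usage sketch, assuming a local recording at the hypothetical path `uyghur_sample.wav` (any format torchaudio can read works; the helper resamples it automatically):

```python
# Hypothetical input file; replace with the path to your own recording
text = transcribe("uyghur_sample.wav")
print(text)
```

Because the model and processor are loaded at module level, the one-time download and GPU transfer are paid once, and repeated `transcribe` calls only pay for feature extraction and generation.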