import numpy as np
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

# Load the processor and the Whisper checkpoint fine-tuned for Uyghur
processor = AutoProcessor.from_pretrained("ixxan/whisper-small-common-voice-ug")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-common-voice-ug")
target_sr = processor.feature_extractor.sampling_rate  # 16 kHz for Whisper
def transcribe(audio_data) -> str:
    """
    Transcribe audio to text using the Whisper model fine-tuned for Uyghur.

    Args:
        audio_data: Gradio audio input, either a (sampling_rate, ndarray)
            tuple from the microphone or a filepath string from a file upload.

    Returns:
        str: The transcription of the audio.
    """
    # Load the audio input
    if not audio_data:
        return "<<ERROR: Empty Audio Input>>"
    if isinstance(audio_data, tuple):
        # Microphone input: Gradio passes a (sampling_rate, int16 ndarray) tuple
        sampling_rate, audio_input = audio_data
        # Normalize 16-bit PCM to [-1, 1] and convert to a tensor so the
        # torchaudio resampler below can operate on it
        audio_input = torch.from_numpy((audio_input / 32768.0).astype(np.float32))
        if audio_input.ndim > 1:
            # Stereo recordings arrive as (samples, channels); downmix to mono
            audio_input = audio_input.mean(dim=1)
    elif isinstance(audio_data, str):
        # File upload: Gradio passes a filepath
        audio_input, sampling_rate = torchaudio.load(audio_data)
        if audio_input.shape[0] > 1:
            # torchaudio returns (channels, frames); downmix to mono
            audio_input = audio_input.mean(dim=0, keepdim=True)
    else:
        return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))
    # Resample to the model's expected sampling rate if needed
    if sampling_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_sr)
        audio_input = resampler(audio_input)

    # Extract log-mel input features for the model
    inputs = processor(audio_input.squeeze(), sampling_rate=target_sr, return_tensors="pt")

    # Move the model and inputs to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate the transcription
    with torch.no_grad():
        generated_ids = model.generate(inputs["input_features"], max_length=225)

    # Decode the generated token IDs into text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription
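
# --- Usage sketch (not part of the original listing) ---
# A minimal way to serve transcribe() as a Gradio app, assuming the
# Gradio 4.x API. With type="numpy", both microphone and uploaded audio
# arrive as a (sampling_rate, ndarray) tuple and take the first branch
# above; type="filepath" would instead exercise the torchaudio branch.
import gradio as gr

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=gr.Textbox(label="Transcription"),
    title="Uyghur Speech Recognition (whisper-small-common-voice-ug)",
)

if __name__ == "__main__":
    demo.launch()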