# Source: Hugging Face Hub file view of asr.py (committed by ixxan).
# Latest commit: "Update asr.py" (3da96bb, verified) - file size 1.9 kB.
import torchaudio
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import numpy as np
# Checkpoint identifier shared by the processor and the model so the two
# can never drift apart.
_MODEL_ID = "ixxan/whisper-small-common-voice-ug"

# Load the pretrained processor and the speech-to-text model once at import
# time; transcribe() reuses these module-level objects on every call.
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_MODEL_ID)

# Sampling rate the feature extractor reports; transcribe() resamples any
# input recorded at a different rate to this value.
target_sr = processor.feature_extractor.sampling_rate
def transcribe(audio_data) -> str:
    """
    Transcribes audio to text using the Whisper model for Uyghur.

    Args:
    - audio_data: Gradio audio input. Either a (sampling_rate, samples)
      tuple holding a NumPy array (microphone), or a str path to an audio
      file (file upload).

    Returns:
    - str: The transcription of the audio, or an "<<ERROR: ...>>" message
      when the input is missing, empty, or of an unsupported type.
    """
    if not audio_data:
        return "<<ERROR: Empty Audio Input>>"

    if isinstance(audio_data, tuple):
        # microphone
        sampling_rate, samples = audio_data
        samples = np.asarray(samples)
        if samples.dtype.kind == "i":
            # Scale signed integer PCM by its own full-scale value
            # (32768 for int16, matching the previous behaviour).
            full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        else:
            # Anything else keeps the historical 16-bit PCM assumption.
            full_scale = 32768.0
        samples = (samples / full_scale).astype(np.float32)
        if samples.ndim > 1:
            # NOTE(review): assumes multi-channel microphone data is laid out
            # as (samples, channels), as Gradio documents - confirm.
            samples = samples.mean(axis=-1)
        # The resampler below operates on tensors, not NumPy arrays.
        waveform = torch.from_numpy(samples)
    elif isinstance(audio_data, str):
        # file upload
        waveform, sampling_rate = torchaudio.load(audio_data)
        if waveform.dim() > 1:
            # torchaudio returns (channels, frames); average the channels so
            # stereo files become a single mono waveform instead of being
            # passed on as 2-D data.
            waveform = waveform.mean(dim=0)
    else:
        return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))

    # A zero-length recording carries nothing to transcribe.
    if waveform.numel() == 0:
        return "<<ERROR: Empty Audio Input>>"

    # Resample if needed
    if sampling_rate != target_sr:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
        waveform = resampler(waveform)

    # Preprocess the audio input (the feature extractor takes a 1-D array)
    inputs = processor(waveform.numpy(), sampling_rate=target_sr, return_tensors="pt")

    # Move model to GPU if available; after the first call the model is
    # already on the target device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    input_features = inputs["input_features"].to(device)

    # Generate transcription
    with torch.no_grad():
        generated_ids = model.generate(input_features, max_length=225)

    # Decode the output to get the transcription text
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]