from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
from umsc import UgMultiScriptConverter
import util

# Model ID and setup
model_id = 'ixxan/wav2vec2-large-mms-1b-uyghur-latin'
asr_model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang="uig-script_latin")
asr_processor = Wav2Vec2Processor.from_pretrained(model_id)
asr_processor.tokenizer.set_target_lang("uig-script_latin")

# Automatically select the device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
asr_model = asr_model.to(device)
def asr(user_audio):
    # Load and resample user audio to the 16 kHz rate the model expects
    audio_input, sampling_rate = util.load_and_resample_audio(user_audio, target_rate=16000)

    # Process audio through the ASR model
    inputs = asr_processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        logits = asr_model(**inputs).logits

    # Greedy CTC decoding: take the most likely token per frame, then collapse to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcript = asr_processor.batch_decode(predicted_ids)[0]
    return transcript
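
# Note: `util.load_and_resample_audio` lives in the Space's local `util` module, which is
# not shown here. The sketch below is only an assumption of what such a helper might look
# like (torchaudio-based, file-path input); it is illustrative and is not the module that
# the code above actually imports.
import torchaudio

def _load_and_resample_audio_sketch(audio_path, target_rate=16000):
    # Load the file as a (channels, samples) float tensor plus its native sample rate
    waveform, orig_rate = torchaudio.load(audio_path)
    # Downmix to mono so the model sees a single channel
    waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate expected by wav2vec2 / MMS models
    if orig_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, orig_freq=orig_rate, new_freq=target_rate)
    return waveform, target_rate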
def check_pronunciation(input_text, script, user_audio):
    # Transcribe the user's audio (Latin script), then convert to Arabic script
    transcript_ugLatn_box = asr(user_audio)
    ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
    transcript_ugArab_box = ug_latn_to_arab(transcript_ugLatn_box)

    # Get IPA and pronunciation feedback
    if script == 'Uyghur Latin':
        input_text = ug_latn_to_arab(input_text)  # ensure the reference text is in Arabic script before IPA conversion
    correct_pronunciation, user_pronunciation, pronunciation_match, pronunciation_score = util.calculate_pronunciation_accuracy(
        reference_text=input_text,
        output_text=transcript_ugArab_box,
        language_code='uig-Arab')
    return transcript_ugArab_box, transcript_ugLatn_box, correct_pronunciation, user_pronunciation, pronunciation_match, pronunciation_score
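
# `util.calculate_pronunciation_accuracy` is also part of the local `util` module and is
# not shown above. A minimal sketch under stated assumptions (epitran for grapheme-to-IPA
# conversion of the 'uig-Arab' code, difflib for aligning the two IPA strings) follows;
# the real helper may align and score differently.
import difflib
import epitran

def _calculate_pronunciation_accuracy_sketch(reference_text, output_text, language_code='uig-Arab'):
    # Convert both the reference text and the ASR output to IPA
    epi = epitran.Epitran(language_code)
    correct_pronunciation = epi.transliterate(reference_text)
    user_pronunciation = epi.transliterate(output_text)
    # Align the two IPA strings and mark which reference symbols were matched
    matcher = difflib.SequenceMatcher(None, correct_pronunciation, user_pronunciation)
    pronunciation_match = [False] * len(correct_pronunciation)
    for block in matcher.get_matching_blocks():
        for i in range(block.a, block.a + block.size):
            pronunciation_match[i] = True
    # Score the attempt as an overall similarity percentage
    pronunciation_score = round(matcher.ratio() * 100, 2)
    return correct_pronunciation, user_pronunciation, pronunciation_match, pronunciation_score

# Example call (the reference text and audio path are hypothetical):
# transcript_arab, transcript_latin, ref_ipa, user_ipa, match, score = check_pronunciation(
#     "yaxshimusiz", "Uyghur Latin", "recording.wav")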