import base64
import io
import math
from functools import lru_cache

import gradio as gr
import librosa
import phonemizer
import torch
from strsimpy.jaro_winkler import JaroWinkler
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Pipeline implemented by lark():
# base64 to audio ✅
# audio to transcription ✅
# audio to text ✅
# text to phoneme ✅
# accuracy = jarowinkler(transcription, phoneme) ✅
# band = getBandFromAccuracy(accuracy) ✅
# return accuracy, band ✅

# Checkpoint used to transcribe audio directly into phonemes.
PHONEME_CHECKPOINT = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
# Checkpoint used to transcribe audio into plain text.
TEXT_CHECKPOINT = "facebook/wav2vec2-base-960h"
# Sample rate the audio is resampled to before being fed to either model.
SAMPLE_RATE = 16000

# (minimum similarity, band) pairs, ordered from the highest band down.
# Any score below the last threshold maps to band 1.
_BAND_THRESHOLDS = (
    (0.91, 9),
    (0.81, 8),
    (0.73, 7),
    (0.65, 6),
    (0.60, 5),
    (0.46, 4),
    (0.35, 3),
    (0.1, 2),
)


@lru_cache(maxsize=None)
def _load_ctc_checkpoint(checkpoint):
    """Return the ``(processor, model)`` pair for *checkpoint*.

    The pair is cached per checkpoint name so the weights are loaded on the
    first request only instead of on every call to ``lark``.
    """
    processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    return processor, model


def _transcribe(checkpoint, waveform, sample_rate):
    """Run greedy CTC decoding of *waveform* with the given checkpoint."""
    processor, model = _load_ctc_checkpoint(checkpoint)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # Decode with the processor that belongs to this checkpoint; each
    # checkpoint has its own vocabulary.
    return processor.batch_decode(predicted_ids)[0]


def _band_from_similarity_score(similarity_score):
    """Map a similarity score to a pronunciation band between 1 and 9."""
    for minimum, band in _BAND_THRESHOLDS:
        if similarity_score >= minimum:
            return band
    return 1


def lark(audioAsB64):
    """Estimate a pronunciation band from base64-encoded audio.

    The audio is transcribed twice: once straight to phonemes, and once to
    text that is then converted to phonemes with ``phonemizer``. The
    Jaro-Winkler similarity of the two phoneme strings is mapped to a band.

    Args:
        audioAsB64: The audio file contents as a base64 string.

    Returns:
        A two-item list ``[similarity_score, band]``.
    """
    # base64 to wav data conversion
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=SAMPLE_RATE)

    # audio to transcription (phonemes as actually spoken)
    speechToPhonemeTranscription = _transcribe(
        PHONEME_CHECKPOINT, waveform, sample_rate
    )

    # audio to text
    speechToTextTranscription = _transcribe(TEXT_CHECKPOINT, waveform, sample_rate)

    # text to phoneme (phonemes expected for the recognised words)
    # NOTE(review): confirm the phonemizer backend handles the letter case
    # produced by the text checkpoint as intended.
    graphemeToPhonemeTranscription = phonemizer.phonemize(speechToTextTranscription)

    # accuracy = jaroWinkler(transcription, phoneme)
    similarity_score = JaroWinkler().similarity(
        speechToPhonemeTranscription, graphemeToPhonemeTranscription
    )

    # ielts pronunciation band estimation
    IELTSband = _band_from_similarity_score(similarity_score)

    return [similarity_score, IELTSband]


# Built and launched at import time, as in the original script.
iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text"])
iface.launch()