# Spaces: Sleeping
# (Hugging Face Spaces page header captured by the scraper — not part of the program.)
import base64
import io
import math

import gradio as gr
import librosa
import phonemizer
import torch
from strsimpy.jaro_winkler import JaroWinkler
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
def speechToPhonemeWS(audioAsB64):
    """Transcribe base64-encoded WAV audio directly into a phoneme string.

    Decodes the audio, resamples it to 16 kHz, and runs the multilingual
    wav2vec2 speech-to-phoneme checkpoint on it.

    Args:
        audioAsB64: Contents of an audio file, encoded as a base64 string.

    Returns:
        The predicted phoneme transcription with all spaces removed, so it
        is directly comparable to the G2P output via similarity().
    """
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    # Load the processor/model once and cache them on the function object:
    # the original reloaded both from the hub on every call, which dominated
    # per-request latency.
    if not hasattr(speechToPhonemeWS, "_model"):
        speechToPhonemeWS._processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
        )
        speechToPhonemeWS._model = Wav2Vec2ForCTC.from_pretrained(
            "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
        )
    processor = speechToPhonemeWS._processor
    model = speechToPhonemeWS._model
    # librosa resamples the source audio to 16 kHz, the rate wav2vec2 was
    # trained on. (The original comment claiming "44.1kHz to 8kHz" was wrong.)
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():  # inference only; no gradients needed
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # batch_decode returns a list (batch of one item).
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0].replace(" ", "")
def speechToTextToPhonemeWS(audioAsB64):
    """Transcribe base64-encoded WAV audio to text, then phonemize the text.

    First runs English ASR (wav2vec2-base-960h), then converts the resulting
    text to phonemes with a grapheme-to-phoneme (G2P) step.

    Args:
        audioAsB64: Contents of an audio file, encoded as a base64 string.

    Returns:
        A two-element list: [text transcription, phoneme string with all
        spaces removed].
    """
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    # librosa resamples the source audio to 16 kHz, the rate wav2vec2 was
    # trained on. (The original comment claiming "44.1kHz to 8kHz" was wrong.)
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
    # Cache the processor/model on the function object instead of reloading
    # from the hub on every call.
    if not hasattr(speechToTextToPhonemeWS, "_model"):
        speechToTextToPhonemeWS._processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-base-960h"
        )
        speechToTextToPhonemeWS._model = Wav2Vec2ForCTC.from_pretrained(
            "facebook/wav2vec2-base-960h"
        )
    processor = speechToTextToPhonemeWS._processor
    model = speechToTextToPhonemeWS._model
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    # FIX: the original ran inference with autograd enabled (unlike its
    # sibling speechToPhonemeWS), building an unused gradient graph.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    speechToTextTranscription = processor.batch_decode(predicted_ids)
    graphemeToPhonemeTranscription = phonemizer.phonemize(
        speechToTextTranscription[0]
    ).replace(" ", "")
    return [speechToTextTranscription[0], graphemeToPhonemeTranscription]
def similarity(S2P, G2P2T):
    """Return the Jaro-Winkler similarity (0..1) between two phoneme strings.

    S2P is the direct speech-to-phoneme transcription; G2P2T is the phoneme
    string derived from the text transcription.
    """
    return JaroWinkler().similarity(S2P, G2P2T)
def similarityScoreToBand(similarity_score):
    """Map a similarity score in [0, 1] to a band from 1 (worst) to 9 (best).

    Thresholds are inclusive lower bounds; anything below 0.1 is band 1.
    """
    # (minimum score, band) pairs, highest band first.
    band_thresholds = (
        (0.91, 9),
        (0.81, 8),
        (0.73, 7),
        (0.65, 6),
        (0.60, 5),
        (0.46, 4),
        (0.35, 3),
        (0.1, 2),
    )
    for minimum, band in band_thresholds:
        if similarity_score >= minimum:
            return band
    return 1
def lark(audioAsB64):
    """Score pronunciation of base64-encoded audio.

    Compares the direct speech-to-phoneme transcription against the phonemes
    derived from the text transcription (ASR + G2P).

    Returns:
        [similarity score, band (1-9), text transcription]
    """
    direct_phonemes = speechToPhonemeWS(audioAsB64)
    text, expected_phonemes = speechToTextToPhonemeWS(audioAsB64)
    score = similarity(expected_phonemes, direct_phonemes)
    return [score, similarityScoreToBand(score), text]
# Gradio UI: one text input (base64 audio), three text outputs
# (similarity score, band, text transcription).
iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text", "text"])

# FIX: guard the server launch so importing this module (e.g. for testing)
# no longer starts the web server as a side effect. Hugging Face Spaces
# executes the file as __main__, so deployment behavior is unchanged.
if __name__ == "__main__":
    iface.launch()