# test / app.py
# Aryan Wadhawan
# Add application file
# 5473c42
# raw history blame
# No virus
# 1.01 kB
import base64
import functools
import os
import tempfile

import gradio as gr
import librosa
import phonemizer
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
_MODEL_NAME = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
_TARGET_SAMPLE_RATE = 16000  # Hz; the rate librosa resamples the input to


@functools.lru_cache(maxsize=1)
def _load_processor_and_model():
    """Load the Wav2Vec2 processor and CTC model once and reuse them.

    Loading pretrained weights is expensive, so the pair is cached after the
    first call instead of being re-created for every request.
    """
    processor = Wav2Vec2Processor.from_pretrained(_MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(_MODEL_NAME)
    return processor, model


def lark(audioAsB64):
    """Transcribe base64-encoded audio with the pretrained Wav2Vec2 CTC model.

    Args:
        audioAsB64: The contents of an audio file (e.g. WAV), base64-encoded,
            as an ASCII string or bytes-like object.

    Returns:
        The list of decoded strings produced by ``processor.batch_decode``
        (one entry per batch item; a single clip yields a one-element list).
        NOTE(review): for this checkpoint the output is presumably phoneme
        labels rather than words -- confirm against the model card.

    Raises:
        ValueError: If the input is empty, is not valid base64, or decodes
            to zero bytes.
    """
    if not audioAsB64:
        raise ValueError(
            "No audio supplied: expected a base64-encoded audio file."
        )
    try:
        audio_bytes = base64.b64decode(audioAsB64)
    except (ValueError, TypeError) as err:
        # binascii.Error (bad padding / bad data) is a ValueError subclass.
        raise ValueError("Input is not valid base64-encoded audio.") from err
    if not audio_bytes:
        raise ValueError("Decoded audio is empty.")

    # Write the decoded bytes to a private temporary file so the audio that
    # is loaded is the audio that was submitted, and so repeated calls do not
    # overwrite a shared file or leave uploads behind in the working directory.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as wav_file:
            wav_file.write(audio_bytes)
        # librosa resamples to the requested rate (16 kHz) while loading.
        waveform, sample_rate = librosa.load(wav_path, sr=_TARGET_SAMPLE_RATE)
    finally:
        os.remove(wav_path)

    processor, model = _load_processor_and_model()
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: take the most likely token id at each frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)
# Expose `lark` through a Gradio interface with one text input and one text
# output, then start serving it.
iface = gr.Interface(
    fn=lark,
    inputs="text",
    outputs="text",
)
iface.launch()