import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import subprocess
import gradio as gr
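# Gradio demo: record Hindi speech from the microphone and transcribe it with
# the Vakyansh wav2vec2 Hindi CTC model. Uploaded audio is first converted to
# the 16 kHz, 16-bit mono WAV the model expects (sox must be on PATH).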
def get_filename(wav_file):
    """Resample the uploaded file to a 16 kHz, 16-bit mono WAV using sox."""
    filename_local = wav_file.split('/')[-1][:-4]
    filename_new = '/tmp/' + filename_local + '_16.wav'
    subprocess.call("sox {} -r 16000 -b 16 -c 1 {}".format(wav_file, filename_new), shell=True)
    return filename_new
def parse_transcription(wav_file):
    # Resample the uploaded audio, then load it from disk
    wav_file = get_filename(wav_file.name)
    audio_input, sample_rate = sf.read(wav_file)

    # Pad the input values and return a PyTorch tensor
    input_values = processor(audio_input, sampling_rate=16_000, return_tensors="pt").input_values

    # Inference: retrieve logits and take the argmax over the vocabulary
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the predicted token ids into text
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
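# Load the processor and model once at startup so every request reuses them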
processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
mic_input = gr.inputs.Audio(source="microphone", type="file")
gr.Interface(parse_transcription, inputs=mic_input, outputs="text",
             analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False)
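# A minimal offline sanity check (hypothetical: assumes a local recording named
# sample_hi.wav and sox on PATH). Uncomment to bypass the Gradio UI; the
# _Upload stub mimics the tempfile object Gradio passes to the callback.
#
# class _Upload:
#     name = "sample_hi.wav"
#
# print(parse_transcription(_Upload()))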