rexoscare's picture
Upload app.py
516501d
raw
history blame
1.88 kB
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import argparse
from glob import glob
import torchaudio
import subprocess
import gradio as gr
# 48 kHz -> 16 kHz resampling transform. The visible code only mentions it in a
# commented-out line; kept in case another part of the project uses it.
resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
def get_filename(wav_file):
    """Convert an audio file to a 16 kHz, 16-bit, mono WAV using ``sox``.

    The converted copy is written to ``/tmp/<stem>_16.wav``, where ``<stem>``
    is the input's base name with its extension removed.

    Args:
        wav_file: Path of the audio file to convert.

    Returns:
        Path of the converted WAV file.

    Raises:
        FileNotFoundError: If the ``sox`` executable cannot be found.
        subprocess.CalledProcessError: If ``sox`` exits with a non-zero status.
    """
    import os  # function-scope import: ``os`` is not imported at file level

    # splitext handles any extension length (the old ``[:-4]`` slice assumed
    # exactly three characters after the dot).
    stem = os.path.splitext(os.path.basename(wav_file))[0]
    filename_new = '/tmp/' + stem + '_16.wav'

    # Pass an argument list without a shell so paths containing spaces or
    # shell metacharacters are handed to sox verbatim, and fail loudly if the
    # conversion does not succeed instead of returning a path to a missing file.
    subprocess.run(
        ["sox", wav_file, "-r", "16000", "-b", "16", "-c", "1", filename_new],
        check=True,
    )
    return filename_new
_MODEL_ID = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
_MODEL_CACHE = {}


def _load_model():
    """Return the ``(processor, model)`` pair, loading it on first use only."""
    if "pair" not in _MODEL_CACHE:
        processor = Wav2Vec2Processor.from_pretrained(_MODEL_ID)
        model = Wav2Vec2ForCTC.from_pretrained(_MODEL_ID)
        # Store both together so a failed load never leaves a half-filled cache.
        _MODEL_CACHE["pair"] = (processor, model)
    return _MODEL_CACHE["pair"]


def parse_transcription(wav_file):
    """Transcribe an audio clip with the pretrained Vakyansh wav2vec2 model.

    Args:
        wav_file: Object exposing the audio file's path as ``.name``
            (presumably the temp-file object Gradio passes for
            ``type="file"`` inputs -- confirm against the Gradio version).

    Returns:
        The decoded transcription string.
    """
    # Loading the weights is expensive; reuse them across requests.
    processor, model = _load_model()

    # Convert the upload with get_filename (sox) before reading the samples.
    converted_path = get_filename(wav_file.name)
    audio_input, sample_rate = sf.read(converted_path)

    # Pass the rate actually read from the file rather than a hard-coded
    # value, so a failed conversion is not silently treated as 16 kHz audio.
    input_values = processor(
        audio_input, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
# Text shown on the demo page.
title = "Speech-to-Text (Hindi) using Vakyansh"
description = "Upload a hindi audio clip, and let AI do the hard work of transcribing."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2104.06678'>Large-Scale Self- and Semi-Supervised Learning for Speech Translation</a></p>"

# Wire the microphone input to parse_transcription and start the app.
gr.Interface(
    fn=parse_transcription,
    inputs=gr.inputs.Audio(
        source="microphone",
        type="file",
        label="Record Audio File",
    ),
    outputs="text",
    title=title,
    description=description,
    article=article,
).launch(inline=False)