# Harveenchadha's picture
# Update app.py
# 6c461f0
# raw
# history blame
# 1.97 kB
import os

import gradio as gr
import scipy.signal as sps
import soundfile as sf
import sox
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
def convert(inputfile, outfile):
    """Transcode *inputfile* into a WAV file written to *outfile* via sox.

    The output is requested as single-channel, 16-bit signed-integer audio
    at a 16000 Hz sample rate.

    Args:
        inputfile: Path of the audio file to read.
        outfile: Path the converted WAV file is written to.
    """
    output_format = {
        "file_type": "wav",
        "channels": 1,
        "encoding": "signed-integer",
        "rate": 16000,
        "bits": 16,
    }
    transformer = sox.Transformer()
    transformer.set_output_format(**output_format)
    transformer.build(inputfile, outfile)
def read_file(wav, target_rate=16000):
    """Down-mix a ``(sample_rate, signal)`` pair to mono and resample it.

    Args:
        wav: Two-item sequence ``(sample_rate, signal)``. ``signal`` must
            support ``len()`` and, when it has more than one dimension,
            ``.mean(-1)`` (presumably a NumPy array with channels on the
            last axis -- confirm against the caller).
        target_rate: Sampling rate, in Hz, of the returned signal. Defaults
            to 16000, the rate that was previously hard-coded here.

    Returns:
        The resampled signal as returned by ``scipy.signal.resample``.
    """
    sample_rate, signal = wav

    # Only average when there is a channel axis. Calling mean(-1) on a 1-D
    # (mono) signal would collapse it to a scalar and break len() below.
    if getattr(signal, "ndim", 1) > 1:
        signal = signal.mean(-1)

    number_of_samples = round(len(signal) * float(target_rate) / sample_rate)
    return sps.resample(signal, number_of_samples)
def parse_transcription(wav_file):
    """Transcribe an audio file with the module-level wav2vec2 model.

    The file is first converted with ``convert`` and written next to the
    input as ``<stem>16k.wav``; that converted file is then read, passed
    through ``processor`` and ``model``, and greedily decoded.

    Args:
        wav_file: Path to the audio file to transcribe.

    Returns:
        The transcription produced by ``processor.decode``.
    """
    # splitext strips only the final extension, so dots elsewhere in the
    # path ("./clip.wav", dotted temp directories) no longer truncate it the
    # way split('.')[0] did.
    stem, _extension = os.path.splitext(wav_file)
    converted_path = stem + "16k.wav"

    convert(wav_file, converted_path)
    speech, _ = sf.read(converted_path)

    input_values = processor(
        speech, sampling_rate=16_000, return_tensors="pt"
    ).input_values

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: pick the highest-scoring token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
# Checkpoint identifier shared by the processor and the CTC model.
MODEL_NAME = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Microphone recordings are handed to parse_transcription as a file path.
# NOTE: the alternative is type="numpy" together with read_file(), which
# resamples in-process instead of converting through sox.
input_ = gr.inputs.Audio(source="microphone", type="filepath")

demo = gr.Interface(
    parse_transcription,
    inputs=input_,
    outputs="text",
    analytics_enabled=False,
    show_tips=False,
    enable_queue=True,
)
demo.launch(inline=False)