cahya's picture
add gradio app
9fbe6a6
raw
history blame
No virus
1.75 kB
import os

import gradio as gr
import soundfile as sf
import sox
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
def convert(inputfile, outfile):
    """Transcode an audio file to 16 kHz, mono, 16-bit signed-integer WAV.

    Args:
        inputfile: Path of the audio file to read.
        outfile: Path the converted WAV file is written to.
    """
    # Output settings handed to sox, kept together so they read as one spec.
    wav_format = {
        "file_type": "wav",
        "channels": 1,
        "encoding": "signed-integer",
        "rate": 16000,
        "bits": 16,
    }
    transformer = sox.Transformer()
    transformer.set_output_format(**wav_format)
    transformer.build(inputfile, outfile)
# Checkpoint name passed to both ``from_pretrained`` calls below.
model_name = "indonesian-nlp/wav2vec2-indonesian-javanese-sundanese"

# Loaded once at import time; parse_transcription reuses these module-level
# objects on every call instead of reloading them.
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
def parse_transcription(wav_file):
    """Transcribe a recorded audio file with the module-level CTC model.

    The recording is converted to a 16 kHz mono WAV written next to the
    original file, read back, and decoded greedily (argmax over the logits).
    The converted copy is deleted once it has been read.

    Args:
        wav_file: Object exposing the recording's path as ``.name``, or
            ``None`` when no audio was provided.

    Returns:
        The decoded transcription, or an empty string when ``wav_file`` is
        ``None``.
    """
    if wav_file is None:
        return ""

    # splitext strips only the final extension, so dots in directory names
    # or earlier in the file name no longer truncate the path.
    base, _ = os.path.splitext(wav_file.name)
    converted_path = base + "16k.wav"
    try:
        convert(wav_file.name, converted_path)
        speech, _ = sf.read(converted_path)
    finally:
        # The converted copy is only an intermediate; do not let it pile up.
        if os.path.exists(converted_path):
            os.remove(converted_path)

    input_values = processor(
        speech, sampling_rate=16_000, return_tensors="pt"
    ).input_values
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
# UI components: a microphone recording (handed over as a file) goes in,
# a single labelled text box comes out.
input_ = gr.inputs.Audio(source="microphone", type="file")
output = gr.outputs.Textbox(label="Indonesian, Javanese or Sundanese")

# Build the interface first, then start it as a separate step.
demo = gr.Interface(
    parse_transcription,
    inputs=input_,
    outputs=[output],
    analytics_enabled=False,
    show_tips=False,
    theme='huggingface',
    layout='vertical',
    title="Multilingual Speech Recognition for Indonesian Languages",
    description="Speech Recognition Live Demo for Indonesian, Javanese and Sundanese",
    enable_queue=True,
)
demo.launch(inline=False)