# Sakil's picture
# Create app.py
# 1a6ae54
# raw
# history blame
# 914 Bytes
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import gradio as gr
from transformers import pipeline
import IPython.display as display
_ASR_MODEL_NAME = "facebook/wav2vec2-base-960h"
_ASR_SAMPLE_RATE = 16000  # sampling rate the audio is resampled to before inference

# Lazily populated (tokenizer, model) pair so the checkpoint is loaded once per
# process instead of once per request.
_asr_components = None


def _load_asr_components():
    """Return the cached (tokenizer, model) pair, loading it on first use."""
    global _asr_components
    if _asr_components is None:
        # NOTE(review): recent Transformers releases mark Wav2Vec2Tokenizer as
        # deprecated in favour of Wav2Vec2Processor -- consider migrating.
        tokenizer = Wav2Vec2Tokenizer.from_pretrained(_ASR_MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(_ASR_MODEL_NAME)
        _asr_components = (tokenizer, model)
    return _asr_components


def speech_text(audio_file):
    """Transcribe an audio recording to text with Wav2Vec2 (greedy CTC decoding).

    Args:
        audio_file: Audio source accepted by ``librosa.load`` (e.g. a file
            path), or ``None`` when no audio was provided.
            NOTE(review): depending on the Gradio version, the ``"audio"``
            input shortcut may deliver a ``(sample_rate, ndarray)`` tuple
            instead of a path -- confirm against the installed version.

    Returns:
        The decoded transcription string; an empty string if ``audio_file``
        is ``None``.
    """
    # Submitting the form without a recording yields None; return an empty
    # transcription rather than failing inside librosa.
    if audio_file is None:
        return ""

    tokenizer, model = _load_asr_components()

    # librosa resamples to the requested rate, so the returned rate is always
    # _ASR_SAMPLE_RATE and can be ignored.
    speech, _ = librosa.load(audio_file, sr=_ASR_SAMPLE_RATE)
    input_values = tokenizer(speech, return_tensors="pt").input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits  # non-normalized per-frame scores

    # Greedy decoding: most likely token id for every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.decode(predicted_ids[0])
# Web UI: a single audio input wired to `speech_text`, showing the resulting
# transcription as plain text.
iface = gr.Interface(
    speech_text,
    inputs="audio",
    outputs="text",
    title='Sakil Transcription',
    description="Transcription",
)

# Start the Gradio server; inline=False avoids embedding the UI in a notebook cell.
iface.launch(inline=False)