# audio_to_text / app.py
import os
import gradio as gr
import whisper
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Tokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"


def audio_to_text(audio):
    # Transcribe the uploaded file with OpenAI Whisper (base checkpoint).
    model = whisper.load_model("base")
    audio = whisper.load_audio(audio)
    result = model.transcribe(audio)
    return result["text"]
    # Alternative path using the locally fine-tuned wav2vec2 model:
    # tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    # logits = preprocess(audio)
    # predicted_ids = torch.argmax(logits, dim=-1)
    # transcriptions = tokenizer.decode(predicted_ids[0])
    # return transcriptions


def preprocess(audio):
    # Run the locally saved wav2vec2 model on the audio file and return its raw logits.
    model_save_path = "model_save"
    model_name = "wav2vec2_osr_version_1"
    speech, rate = librosa.load(audio, sr=16000)
    model_path = os.path.join(model_save_path, model_name + ".pt")
    pipeline_path = os.path.join(model_save_path, model_name + "_vocab")
    access_token = "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"
    processor = Wav2Vec2Processor.from_pretrained(pipeline_path, use_auth_token=access_token)
    # Load the serialized model onto the same device as the inputs.
    model = torch.load(model_path, map_location=device)
    model.eval()
    input_values = processor(speech, sampling_rate=rate, return_tensors="pt").input_values.to(device)
    with torch.no_grad():
        logits = model(input_values).logits
    return logits
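

# A minimal sketch (not wired into the UI) of the wav2vec2 decoding path that the
# commented-out lines in audio_to_text describe: take the logits from preprocess,
# greedily pick the most likely token at each frame, and decode with the public
# facebook/wav2vec2-base-960h tokenizer. The function name is illustrative, and it
# assumes that tokenizer's vocabulary matches the locally fine-tuned model's.
def audio_to_text_wav2vec(audio):
    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    logits = preprocess(audio)                    # shape: (1, time_steps, vocab_size)
    predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
    return tokenizer.decode(predicted_ids[0])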


demo = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(source="upload", type="filepath"),
    examples=[["example.flac"]],
    outputs="text",
)

demo.launch()