import gradio as gr
import torch
import librosa
import time
import pandas as pd
from datetime import datetime
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
DESCRIPTION = "Stores a record of previous calls in order to verify whether the client has called before. The speaker-identification model is pretrained on [SUPERB](https://huggingface.co/datasets/superb) using the [S3PRL recipe](https://github.com/s3prl/s3prl/tree/master/s3prl/downstream/voxceleb1)."
# COLUMNS = ["call_id", "date", "client_id", "duration", "new"]
model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-large-superb-sid")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-large-superb-sid")
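
# Assumption: the Space normally ships with call_records.csv alongside app.py. As a
# minimal guard for a fresh environment, create an empty record file with the expected
# schema (the same columns as the commented-out COLUMNS list above) if it is missing.
import os

if not os.path.exists("call_records.csv"):
    pd.DataFrame(columns=["call_id", "date", "client_id", "duration", "new"]).to_csv(
        "call_records.csv", index=False
    )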

def file_to_array(path):
    # Load the audio as a 16 kHz mono waveform, matching the model's expected sampling rate.
    speech, _ = librosa.load(path, sr=16000, mono=True)
    duration = librosa.get_duration(y=speech, sr=16000)
    return speech, duration

def handler(audio_path):
    calls = pd.read_csv("call_records.csv")
    speech, duration = file_to_array(audio_path)

    # Compute attention masks and normalize the waveform if needed.
    inputs = feature_extractor(speech, sampling_rate=16000, padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    labels = [model.config.id2label[_id] for _id in predicted_ids.tolist()]

    # The predicted speaker label serves as the client identifier.
    client_id = labels[0]
    call_id = str(int(time.time()))
    date = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    n_of_calls = len(calls.loc[calls.client_id == client_id])
    new = n_of_calls == 0

    # Append the new call record and persist it.
    record = [call_id, date, client_id, duration, new]
    calls.loc[len(calls)] = record
    calls.to_csv("call_records.csv", index=False)

    if new:
        return f"New client call: Client ID {client_id}"
    return f"Client {client_id} calling again: {n_of_calls} previous calls"

first = gr.Interface(
    fn=handler,
    inputs=gr.Audio(label="Speech Audio", type="filepath"),
    outputs=gr.Text(label="Output", value="..."),
    description=DESCRIPTION,
)
second = gr.Interface(
    fn=handler,
    inputs=gr.Audio(label="Microphone Input", source="microphone", type="filepath"),
    outputs=gr.Text(label="Output", value="..."),
    description=DESCRIPTION,
)
app = gr.TabbedInterface(
    [first, second],
    title="Speaker Call Verification 🎤",
    tab_names=["Audio Upload", "Microphone"],
)
app.launch()