"""Speaker call verification app.

Identifies the speaker of an incoming call with a wav2vec2 speaker-ID model,
appends the call to a CSV log, and reports whether the client is new or has
called before.
"""

import time
from datetime import datetime

import gradio as gr
import librosa
import pandas as pd
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

DESCRIPTION = "Store a record of previous calls in order to verify if the client already called or not. Pretrained on `https://huggingface.co/datasets/superb` using [S3PRL recipe](https://github.com/s3prl/s3prl/tree/master/s3prl/downstream/voxceleb1)."

# Schema of the call log; also used to bootstrap an empty log on first run.
COLUMNS = ["call_id", "date", "client_id", "duration", "new"]
RECORDS_PATH = "call_records.csv"
# wav2vec2 models expect 16 kHz input; keep load, duration, and feature
# extraction all on the same rate.
SAMPLE_RATE = 16000

model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-large-superb-sid")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-large-superb-sid")


def file_to_array(path):
    """Load an audio file as a mono 16 kHz waveform.

    Returns:
        tuple: (samples as a 1-D float array, duration in seconds).
    """
    speech, _ = librosa.load(path, sr=SAMPLE_RATE, mono=True)
    # BUG FIX: get_duration defaults to sr=22050; without passing the actual
    # sample rate every duration was inflated by a factor of 22050/16000.
    duration = librosa.get_duration(y=speech, sr=SAMPLE_RATE)
    return speech, duration


def _load_records():
    """Read the call log, creating an empty one if it does not exist yet.

    BUG FIX: the original read unconditionally and crashed with
    FileNotFoundError on the very first call.
    """
    try:
        return pd.read_csv(RECORDS_PATH)
    except FileNotFoundError:
        return pd.DataFrame(columns=COLUMNS)


def handler(audio_path):
    """Identify the caller, log the call, and describe the result.

    Args:
        audio_path: filesystem path to the recorded call audio
            (Gradio passes this because the components use type="filepath").

    Returns:
        str: human-readable message saying whether the client is new or how
        many previous calls they have on record.
    """
    calls = _load_records()
    speech, duration = file_to_array(audio_path)
    # compute attention masks and normalize the waveform if needed
    inputs = feature_extractor(speech, sampling_rate=SAMPLE_RATE, padding=True, return_tensors="pt")
    # Inference only — no_grad skips autograd bookkeeping and saves memory.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    labels = [model.config.id2label[_id] for _id in predicted_ids.tolist()]
    # Single utterance in, so the first (only) label is the speaker ID.
    client_id = labels[0]

    call_id = str(int(time.time()))
    date = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    n_of_calls = len(calls.loc[calls.client_id == client_id])
    new = n_of_calls == 0

    # add new call record and persist the log
    calls.loc[len(calls)] = [call_id, date, client_id, duration, new]
    calls.to_csv(RECORDS_PATH, index=False)

    if new:
        return f"New client call: Client ID {client_id}"
    return f"Client {client_id} calling again: {n_of_calls} previous calls"


first = gr.Interface(
    fn=handler,
    inputs=gr.Audio(label="Speech Audio", type="filepath"),
    outputs=gr.Text(label="Output", value="..."),
    description=DESCRIPTION,
)

second = gr.Interface(
    fn=handler,
    # NOTE(review): `source=` was removed in Gradio 4.x in favour of
    # `sources=["microphone"]` — confirm the installed Gradio version.
    inputs=gr.Audio(label="Microphone Input", source="microphone", type="filepath"),
    outputs=gr.Text(label="Output", value="..."),
    description=DESCRIPTION,
)

app = gr.TabbedInterface(
    [first, second],
    title="Speaker Call Verification 🎤",
    tab_names=["Audio Upload", "Microphone"],
)

app.launch()