|
import joblib |
|
from transformers import AutoFeatureExtractor, WavLMModel |
|
import torch |
|
import soundfile as sf |
|
import numpy as np |
|
import gradio as gr |
|
import librosa |
|
|
|
class HuggingFaceFeatureExtractor:
    """Bundle a Hugging Face preprocessor with a pretrained model for inference.

    Both pieces are loaded from the same checkpoint ``name``. The model is
    switched to eval mode and moved to CUDA when available, otherwise the CPU.
    """

    def __init__(self, model_class, name):
        """Load the preprocessor and the ``model_class`` weights for ``name``.

        Args:
            model_class: Class exposing ``from_pretrained`` (the registry in
                this file passes ``WavLMModel``).
            name: Checkpoint identifier handed to both ``from_pretrained`` calls.
        """
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)

        network = model_class.from_pretrained(name)
        self.model = network
        network.eval()  # inference only
        network.to(self.device)

    def __call__(self, audio, sr):
        """Return the model's ``last_hidden_state`` for ``audio``.

        Args:
            audio: Waveform handed to the preprocessor as-is.
            sr: Sampling rate forwarded as ``sampling_rate``.

        Returns:
            ``outputs.last_hidden_state`` from the model, computed without
            gradient tracking (presumably shaped (batch, frames, hidden) --
            TODO confirm against the model documentation).
        """
        encoded = self.feature_extractor(
            audio, sampling_rate=sr, return_tensors="pt", padding=True
        )

        # Every preprocessor output must live on the same device as the model.
        model_inputs = {}
        for key, tensor in encoded.items():
            model_inputs[key] = tensor.to(self.device)

        with torch.no_grad():
            return self.model(**model_inputs).last_hidden_state
|
|
|
# Registry key -> Hugging Face checkpoint id. Every entry is loaded as a WavLMModel.
_WAVLM_CHECKPOINTS = {
    "wavlm-base": "microsoft/wavlm-base",
    "wavLM-V1": "DavidCombei/wavLM-base-DeepFake_UTCN",
    "wavLM-V2": "DavidCombei/wavLM-base-UTCN",
    "wavLM-V3": "DavidCombei/wavLM-base-UTCN_114k",
}

# Zero-argument factories: no extractor is constructed until a factory is called.
# The checkpoint is bound as a default argument so each factory keeps its own id
# instead of all of them sharing the comprehension's last value.
FEATURE_EXTRACTORS = {
    key: (lambda checkpoint=checkpoint: HuggingFaceFeatureExtractor(WavLMModel, checkpoint))
    for key, checkpoint in _WAVLM_CHECKPOINTS.items()
}
|
|
|
# Classifiers are read from the working directory at import time.
# NOTE(review): joblib.load unpickles arbitrary objects -- these files must come
# from a trusted source.
# process_audio feeds model1..model4 the embeddings of the four extractors (in
# registry order) and final_model the four stacked probabilities.
model1, model2, model3, model4 = (
    joblib.load(path)
    for path in ('model1.joblib', 'model2.joblib', 'model3.joblib', 'model4.joblib')
)
final_model = joblib.load('final_model.joblib')
|
|
|
# Extractors already built, keyed by their FEATURE_EXTRACTORS name. Building one
# loads a full pretrained model, so it is done once per process rather than once
# per request.
_EXTRACTOR_CACHE = {}


def _get_extractor(name):
    """Return the extractor registered under ``name``, building it on first use."""
    if name not in _EXTRACTOR_CACHE:
        _EXTRACTOR_CACHE[name] = FEATURE_EXTRACTORS[name]()
    return _EXTRACTOR_CACHE[name]


def _positive_probability(classifier, extractor_name, audio, sr):
    """Return ``classifier``'s class-1 probability for ``audio`` as a (1, 1) array."""
    hidden = _get_extractor(extractor_name)(audio, sr)
    # Average over dim 1 of the hidden states (presumably the frame axis), then
    # flatten to the single-row 2-D input that predict_proba is given.
    embedding = torch.mean(hidden, dim=1).cpu().numpy().reshape(1, -1)
    return classifier.predict_proba(embedding)[:, 1].reshape(-1, 1)


def process_audio(file_audio):
    """Classify an audio file as real or fake and report the confidence.

    The file is loaded as 16 kHz mono. Each of the four extractors produces a
    mean-pooled embedding that is scored by its paired classifier
    (model1..model4). The four class-1 probabilities are stacked and passed to
    ``final_model``, whose class-1 probability is treated as "real".

    Args:
        file_audio: Path of the audio file to analyse. ``None`` or an empty
            path (nothing submitted) is accepted.

    Returns:
        A human-readable verdict string, or a short message when no audio or
        an empty signal was supplied.
    """
    if not file_audio:
        return "No audio provided. Please upload or record an audio file."

    # mono=True (librosa's default) already downmixes, so the signal is 1-D.
    audio, sr = librosa.load(file_audio, sr=16000, mono=True)
    if audio.size == 0:
        return "The audio file is empty. Please provide a file that contains audio."

    # Built inside the function so the pairing is resolved at call time.
    stages = (
        ("wavlm-base", model1),
        ("wavLM-V1", model2),
        ("wavLM-V2", model3),
        ("wavLM-V3", model4),
    )
    eval_combined_probs = np.hstack(
        [_positive_probability(clf, name, audio, sr) for name, clf in stages]
    )

    # One sample in, so take the scalar instead of comparing a 1-element array.
    final_prob = final_model.predict_proba(eval_combined_probs)[:, 1][0]

    if final_prob < 0.5:
        return f"Fake with a confidence of: {100 - final_prob * 100:.2f}%"
    return f"Real with a confidence of: {final_prob * 100:.2f}%"
|
|
|
# NOTE(review): "ensamble" below is a misspelling of "ensemble"; the text is kept
# verbatim here so the UI copy is unchanged.
_UI_TITLE = "Audio Deepfake Detection"
_UI_DESCRIPTION = "Upload an audio file to detect whether it is fake or real. The system uses features ensamble from wavLM base and finetuned versions. Submitted to ASVSpoof5."

# The audio input is configured with type="filepath" and process_audio is the
# handler; its returned string is shown through the "text" output.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title=_UI_TITLE,
    description=_UI_DESCRIPTION,
)

# Launched at module level, i.e. as soon as this file is executed or imported.
interface.launch(share=True)
|
|