import joblib
from transformers import AutoFeatureExtractor, WavLMModel
import torch
import soundfile as sf
import numpy as np
import gradio as gr
import librosa


class HuggingFaceFeatureExtractor:
    """Callable wrapper around a Hugging Face feature extractor + model pair.

    On construction the pretrained preprocessor and model identified by
    ``name`` are loaded, the model is switched to eval mode and moved to
    CUDA when available (CPU otherwise).

    Args:
        model_class: Class exposing ``from_pretrained`` (e.g. ``WavLMModel``).
        name: Checkpoint identifier passed to ``from_pretrained``.
    """

    def __init__(self, model_class, name):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
        self.model = model_class.from_pretrained(name)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio, sr):
        """Run the model on ``audio`` and return its ``last_hidden_state``.

        Args:
            audio: Waveform samples accepted by the feature extractor.
            sr: Sampling rate of ``audio``, forwarded as ``sampling_rate``.

        Returns:
            The ``last_hidden_state`` tensor of the model output, still on
            ``self.device``.
        """
        inputs = self.feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True,
        )
        # The model lives on self.device, so its inputs must be moved there too.
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state


# Zero-argument factories: nothing is downloaded or loaded until one is called.
FEATURE_EXTRACTORS = {
    "wavlm-base": lambda: HuggingFaceFeatureExtractor(WavLMModel, "microsoft/wavlm-base"),
    "wavLM-V1": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-DeepFake_UTCN"),
    "wavLM-V2": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-UTCN"),
    "wavLM-V3": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-UTCN_114k"),
}

# NOTE: joblib.load unpickles arbitrary objects; only ship trusted model files.
model1 = joblib.load('model1.joblib')
model2 = joblib.load('model2.joblib')
model3 = joblib.load('model3.joblib')
model4 = joblib.load('model4.joblib')
final_model = joblib.load('final_model.joblib')

# Each first-stage classifier is paired with the extractor whose pooled
# features it consumes; the order defines the column order fed to final_model.
_BRANCHES = (
    ("wavlm-base", model1),
    ("wavLM-V1", model2),
    ("wavLM-V2", model3),
    ("wavLM-V3", model4),
)

# Extractors built so far, keyed by FEATURE_EXTRACTORS name.
_EXTRACTOR_CACHE = {}


def _get_extractor(name):
    """Return the extractor registered under ``name``, building it only once.

    Constructing an extractor calls ``from_pretrained`` twice, so instances
    are kept for the lifetime of the process instead of being rebuilt on
    every request (at the cost of keeping the models resident in memory).
    """
    extractor = _EXTRACTOR_CACHE.get(name)
    if extractor is None:
        extractor = FEATURE_EXTRACTORS[name]()
        _EXTRACTOR_CACHE[name] = extractor
    return extractor


def _branch_probability(name, classifier, audio, sr):
    """Return ``classifier``'s column-1 probability for ``audio`` as a (n, 1) array."""
    hidden = _get_extractor(name)(audio, sr)
    # Average over dim 1 (the sequence axis of last_hidden_state per the
    # Transformers docs) to get one fixed-size vector for the classifier.
    pooled = torch.mean(hidden, dim=1).cpu().numpy().reshape(1, -1)
    return classifier.predict_proba(pooled)[:, 1].reshape(-1, 1)


def process_audio(file_audio):
    """Classify an audio file as fake or real and report the confidence.

    The file is loaded at 16 kHz, passed through every branch in
    ``_BRANCHES`` (extractor -> mean pooling -> first-stage classifier), and
    the four branch probabilities are stacked as the input of ``final_model``.

    Args:
        file_audio: Path of the audio file, as supplied by the Gradio
            ``Audio(type="filepath")`` input; ``None`` when nothing was given.

    Returns:
        ``"Fake with a confidence of: XX.XX%"`` when the final column-1
        probability is below 0.5, otherwise
        ``"Real with a confidence of: XX.XX%"``.

    Raises:
        gr.Error: If no file was provided or the file holds no samples.
    """
    if file_audio is None:
        raise gr.Error("Please upload or record an audio file first.")

    audio, sr = librosa.load(file_audio, sr=16000)  # Resample to 16 kHz
    if audio.ndim > 1:
        # Keep only the first row if a multi-dimensional array comes back.
        audio = audio[0]
    if audio.size == 0:
        raise gr.Error("The audio file contains no samples.")

    eval_combined_probs = np.hstack(
        [_branch_probability(name, clf, audio, sr) for name, clf in _BRANCHES]
    )
    # Single input row -> take the scalar instead of truth-testing an array.
    final_prob = final_model.predict_proba(eval_combined_probs)[0, 1]

    if final_prob < 0.5:
        return f"Fake with a confidence of: {100 - final_prob * 100:.2f}%"
    return f"Real with a confidence of: {final_prob * 100:.2f}%"


interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Deepfake Detection",
    description="Upload an audio file to detect whether it is fake or real. The system uses features ensamble from wavLM base and finetuned versions. Submitted to ASVSpoof5.",
)

interface.launch(share=True)