import torch
import evaluate
from transformers import AutoModel, AutoProcessor, pipeline


def clap_similarity(clap_model_name_or_path, texts, audios, device):
    """Return the mean cosine similarity between text and audio embeddings.

    A model and its processor are loaded from ``clap_model_name_or_path``,
    both modalities are embedded on ``device`` without gradient tracking, and
    the cosine similarities (taken along ``dim=1``) are averaged.

    Args:
        clap_model_name_or_path: Identifier or path handed to
            ``AutoModel.from_pretrained`` and ``AutoProcessor.from_pretrained``.
        texts: Forwarded to the processor as ``text=``.
        audios: Forwarded to the processor as ``audios=``.
            NOTE(review): no sampling rate is passed to the processor, so the
            audio is presumably already at the rate the model expects -- confirm.
        device: Device on which the forward passes are executed.

    Returns:
        The mean similarity as a tensor moved to the CPU.
    """
    model = AutoModel.from_pretrained(clap_model_name_or_path)
    processor = AutoProcessor.from_pretrained(clap_model_name_or_path)

    batch = processor(
        text=texts,
        audios=audios,
        padding=True,
        return_tensors="pt",
    ).to(device)
    model.to(device)

    with torch.no_grad():
        text_embeddings = model.get_text_features(
            batch["input_ids"],
            attention_mask=batch.get("attention_mask"),
        )
        audio_embeddings = model.get_audio_features(batch["input_features"])
        # assumes both embeddings are (batch, dim), giving one score per
        # text/audio pair -- TODO confirm against the model outputs
        similarities = torch.nn.functional.cosine_similarity(
            audio_embeddings, text_embeddings, dim=1, eps=1e-8
        )

    # Move the model and its inputs off the accelerator before returning.
    model.to("cpu")
    batch.to("cpu")

    mean_similarity = similarities.mean()
    return mean_similarity.to("cpu")


def wer(asr_model_name_or_path, prompts, audios, device, per_device_eval_batch_size, sampling_rate):
    """Transcribe ``audios`` and score the transcripts against ``prompts``.

    Args:
        asr_model_name_or_path: Model identifier or path given to ``pipeline``.
        prompts: Reference strings; lower-cased before scoring.
        audios: Iterable of raw audio inputs, each wrapped as
            ``{"raw": audio, "sampling_rate": sampling_rate}`` for the pipeline.
        device: Device argument given to ``pipeline``.
        per_device_eval_batch_size: Batch size for the pipeline call; cast
            to ``int`` before use.
        sampling_rate: Sampling rate attached to every audio input.

    Returns:
        A ``(word_error, transcripts)`` tuple: the ``"wer"`` metric value
        multiplied by 100, and the list of transcripts as produced by the
        pipeline (original casing).
    """
    wer_metric = evaluate.load("wer")
    recognizer = pipeline(model=asr_model_name_or_path, device=device)

    asr_inputs = [{"raw": audio, "sampling_rate": sampling_rate} for audio in audios]
    outputs = recognizer(asr_inputs, batch_size=int(per_device_eval_batch_size))

    transcripts = [output["text"] for output in outputs]

    # Only case is normalised before scoring; the returned transcripts keep
    # their original casing.
    score = wer_metric.compute(
        predictions=[transcript.lower() for transcript in transcripts],
        references=[prompt.lower() for prompt in prompts],
    )
    return 100 * score, transcripts