# Web-page header captured together with this file (not part of the program):
#   Forbu14's picture
#   adding main files -- 69d022a
from mteb import MTEB
import torch
import clip
import numpy as np
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load the "RN50" CLIP checkpoint once at import time, onto DEVICE.
# clip.load returns two values; both are kept as module-level names.
MODEL, PREPROCESS = clip.load("RN50", device=DEVICE)
# Benchmark task names, grouped by task family. Each group is a plain list of
# task-name strings; TASK_LIST at the bottom concatenates all of them.

TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification",
    "AmazonPolarityClassification",
    "AmazonReviewsClassification",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification",
]

TASK_LIST_CLUSTERING = [
    "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",
]

TASK_LIST_PAIR_CLASSIFICATION = [
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
]

TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    "MindSmallReranking",
    "SciDocsRR",
    "StackOverflowDupQuestions",
]

TASK_LIST_RETRIEVAL = [
    "ArguAna",
    "ClimateFEVER",
    "CQADupstackAndroidRetrieval",
    "CQADupstackEnglishRetrieval",
    "CQADupstackGamingRetrieval",
    "CQADupstackGisRetrieval",
    "CQADupstackMathematicaRetrieval",
    "CQADupstackPhysicsRetrieval",
    "CQADupstackProgrammersRetrieval",
    "CQADupstackStatsRetrieval",
    "CQADupstackTexRetrieval",
    "CQADupstackUnixRetrieval",
    "CQADupstackWebmastersRetrieval",
    "CQADupstackWordpressRetrieval",
    "DBPedia",
    "FEVER",
    "FiQA2018",
    "HotpotQA",
    "MSMARCO",
    "NFCorpus",
    "NQ",
    "QuoraRetrieval",
    "SCIDOCS",
    "SciFact",
    "Touche2020",
    "TRECCOVID",
]

# NOTE(review): "SummEval" appears to be a summarization task rather than an
# STS task in MTEB; it is kept in this group unchanged -- confirm the intent.
TASK_LIST_STS = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22",
    "STSBenchmark",
    "SummEval",
]

# Every task to evaluate, in group order. The parentheses are required: without
# them only the first list is assigned and each following "+ LIST" line is a
# separate unary-plus expression, which raises TypeError for a list.
TASK_LIST = (
    TASK_LIST_CLASSIFICATION
    + TASK_LIST_CLUSTERING
    + TASK_LIST_PAIR_CLASSIFICATION
    + TASK_LIST_RERANKING
    + TASK_LIST_RETRIEVAL
    + TASK_LIST_STS
)
class ClipModel:
    """Wrapper exposing the module-level CLIP text encoder through ``encode``.

    The wrapper holds no state of its own: it tokenizes with ``clip.tokenize``
    and embeds with the module-level ``MODEL`` on ``DEVICE``.
    """

    # Character budget applied to a sentence whose tokenization overflows the
    # tokenizer's context window (same cut-off the original fallback used).
    _FALLBACK_MAX_CHARS = 77 * 2

    def _tokenize(self, sentence):
        """Tokenize one sentence, shortening it when it is too long for CLIP."""
        try:
            return clip.tokenize(sentence)
        except RuntimeError:
            # clip.tokenize raises RuntimeError when the text does not fit in
            # its context length. Retry on a character-truncated prefix, and
            # let the tokenizer clip any remaining overflow so the retry
            # itself cannot fail for the same reason.
            print("too long token")
            return clip.tokenize(
                sentence[: self._FALLBACK_MAX_CHARS], truncate=True
            )

    def encode(self, sentences, batch_size=1, **kwargs):
        """Returns a list of embeddings for the given sentences.

        Args:
            sentences (`List[str]`): List of sentences to encode
            batch_size (`int`): Number of sentences embedded per forward pass;
                values below 1 are treated as 1
            **kwargs: Accepted for interface compatibility and ignored

        Returns:
            `List[np.ndarray]`: One embedding per input sentence, in input
            order. An empty input yields an empty list.
        """
        step = max(int(batch_size), 1)
        embeddings = []
        for start in range(0, len(sentences), step):
            chunk = sentences[start : start + step]
            # Each sentence is tokenized on its own so that one over-long
            # sentence only triggers the fallback for itself.
            tokens = torch.cat([self._tokenize(s) for s in chunk]).to(DEVICE)
            with torch.no_grad():
                features = MODEL.encode_text(tokens)
            # One row per sentence; split the batch back into per-sentence
            # arrays.
            embeddings.extend(features.cpu().numpy())
        return embeddings
# Wrap the CLIP text encoder in the object handed to the benchmark.
model = ClipModel()

# Build the evaluation over every task in TASK_LIST, restricted to "en".
evaluation = MTEB(
    tasks=TASK_LIST,
    output_folder=f"results/clip/",
    task_langs=["en"],
)

# Runs at import time: this file is a script, not an importable library.
evaluation.run(model)