Upload 3 files

Browse files

Files changed (3) hide show

scripts/evaluate_en_mteb/model_for_evaluate.py +83 -0
scripts/evaluate_en_mteb/mteb_utils.py +303 -0
scripts/evaluate_en_mteb/run_evaluate_mteb.py +24 -0

scripts/evaluate_en_mteb/model_for_evaluate.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import functools
+import torch
+import numpy as np
+from typing import Sequence, Any
+from mteb.encoder_interface import PromptType
+from mteb import Encoder
+from sentence_transformers import SentenceTransformer
+from mteb_utils import get_task_def_by_task_name_and_type, get_detailed_instruct, get_task_type_en
+def jasper_vl_forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
+    """Run ``self.auto_model`` on the relevant inputs and store its sentence embedding.
+
+    Only ``input_ids``, ``attention_mask`` and, when present, ``pixel_values``
+    are taken from ``features`` and handed to the model, together with any
+    extra ``kwargs``.
+
+    Args:
+        self: Object exposing an ``auto_model`` callable whose result can be
+            indexed with ``"sentence_embedding"``.
+        features: Feature mapping; must contain ``input_ids`` and ``attention_mask``.
+        **kwargs: Forwarded unchanged to ``self.auto_model``.
+
+    Returns:
+        The same ``features`` mapping, updated in place with a
+        ``sentence_embedding`` entry.
+    """
+    model_inputs = {key: features[key] for key in ("input_ids", "attention_mask")}
+    if "pixel_values" in features:
+        model_inputs["pixel_values"] = features["pixel_values"]
+    model_output = self.auto_model(**model_inputs, **kwargs)
+    features["sentence_embedding"] = model_output["sentence_embedding"]
+    return features
+class MTEB_Sentence_Transformer(Encoder):
+    """MTEB encoder that embeds text with a ``SentenceTransformer`` through a multi-process pool.
+
+    The first module's ``forward`` is replaced by ``jasper_vl_forward`` and
+    task-specific instructions are prepended to the inputs in ``encode``.
+    """
+    def __init__(
+            self,
+            model_path_or_name: str,
+            lang: str,
+            batch_size: int,
+            max_length: int,
+            device: str | None = None
+    ) -> None:
+        """Load the model, patch its forward pass and start the worker pool.
+
+        Args:
+            model_path_or_name: Path or hub id handed to ``SentenceTransformer``.
+            lang: Stored as ``self.lang``; not read anywhere else in this class.
+            batch_size: Batch size passed to ``encode_multi_process``.
+            max_length: Assigned to the model's ``max_seq_length``.
+            device: Forwarded to ``Encoder.__init__`` only; the model itself is
+                always constructed with ``device="cpu"``.
+        """
+        super().__init__(device=device)
+        model = SentenceTransformer(
+            model_path_or_name,
+            trust_remote_code=True,
+            device="cpu",
+            model_kwargs={
+                "torch_dtype": torch.bfloat16,
+                "attn_implementation": "sdpa"
+            },
+            # NOTE(review): these keys are interpreted by the model's remote
+            # code; their exact meaning is not visible here.
+            config_kwargs={"is_text_encoder": True, "vector_dim": 12288},
+            tokenizer_kwargs={"padding_side": "right"}
+        )
+        first_module = model._first_module()
+        first_module.forward = functools.partial(jasper_vl_forward, first_module)
+        # Configure the model completely *before* starting the pool: the worker
+        # processes get their own copy of the model when they are started, so
+        # a sequence limit assigned afterwards would only affect this process.
+        model.max_seq_length = max_length
+        self.model = model
+        self.pool = self.model.start_multi_process_pool()
+        self.lang = lang
+        self.batch_size = batch_size
+    def close(self) -> None:
+        """Stop the worker pool started in ``__init__``; safe to call more than once.
+
+        After closing, ``encode`` can no longer be used on this instance.
+        """
+        if self.pool is not None:
+            self.model.stop_multi_process_pool(self.pool)
+            self.pool = None
+    def encode(
+            self,
+            sentences: Sequence[str],
+            *,
+            task_name: str,
+            prompt_type: PromptType | None = None,
+            **kwargs: Any,
+    ) -> np.ndarray:
+        """Embed ``sentences`` for the given task and return float32 vectors.
+
+        For Retrieval tasks only queries get the instruction prefix; passages
+        are encoded as-is. For every other task type all inputs are prefixed.
+
+        Args:
+            sentences: Texts to embed.
+            task_name: Task name used to pick the task type and instruction.
+            prompt_type: ``"query"`` or ``"passage"``; only consulted for
+                Retrieval tasks.
+            **kwargs: Accepted for interface compatibility and ignored.
+
+        Returns:
+            The normalized embeddings as a ``float32`` array.
+
+        Raises:
+            ValueError: The task name is unknown, or a Retrieval task is
+                called with a ``prompt_type`` other than query/passage.
+            KeyError: No instruction is registered for ``task_name``.
+        """
+        task_type = get_task_type_en(task_name)
+        instruction = get_detailed_instruct(get_task_def_by_task_name_and_type(task_name, task_type))
+        if task_type != "Retrieval" or prompt_type == "query":
+            sentences = [instruction + sen for sen in sentences]
+        elif prompt_type != "passage":
+            raise ValueError(f"unknown prompt_type:{prompt_type}")
+        # Replace empty / whitespace-only inputs with a placeholder token so
+        # the model never receives a blank string.
+        sentences = [i if i.strip() else "<|endoftext|>" for i in sentences]
+        vectors = self.model.encode_multi_process(
+            sentences=sentences,
+            pool=self.pool,
+            batch_size=self.batch_size,
+            show_progress_bar=True,
+            normalize_embeddings=True
+        )
+        vectors = vectors.astype(dtype=np.float32)
+        print("vectors.shape", vectors.shape)
+        return vectors

scripts/evaluate_en_mteb/mteb_utils.py ADDED Viewed

	@@ -0,0 +1,303 @@

+from typing import Dict
+# Task-name groupings, presumably by how long each task takes to evaluate
+# (inferred from the names only). None of the three lists is referenced by
+# the other code visible in this change.
+LONG_TIME_TASK_NAMES = [
+    "MSMARCO",
+    "FEVER",
+    "HotpotQA",
+    "ClimateFEVER",
+    "DBPedia",
+    "NQ",
+    "ArxivClusteringP2P",
+    "ArxivClusteringS2S",
+    "RedditClusteringP2P",
+    "RedditClustering",
+    "QuoraRetrieval",
+    "StackExchangeClustering",
+    "Touche2020",
+    "MindSmallReranking",
+    "AmazonPolarityClassification",
+    "BiorxivClusteringP2P",
+    "StackExchangeClusteringP2P",
+    "TRECCOVID"
+]
+# Every name in this list also appears in MID_TIME_TASK_NAMES below.
+SHORT_TIME_TASK_NAMES = [
+    "BIOSSES",
+    "STS17",
+    "STS16",
+    "AskUbuntuDupQuestions",
+    "SummEval",
+    "SciFact",
+    "TweetSentimentExtractionClassification",
+    "EmotionClassification",
+    "SprintDuplicateQuestions"
+]
+# NOTE(review): the last four names (TRECCOVID, BiorxivClusteringP2P,
+# StackExchangeClusteringP2P, StackExchangeClustering) are also listed in
+# LONG_TIME_TASK_NAMES - confirm whether the overlap is intended.
+MID_TIME_TASK_NAMES = ['BIOSSES', 'STS17', 'STS22', 'STS16', 'STSBenchmark', 'STS13', 'STS15', 'STS12', 'STS14',
+                       'AskUbuntuDupQuestions', 'TwitterSemEval2015', 'SummEval', 'SICK-R', 'NFCorpus', 'SciFact',
+                       'CQADupstackWebmastersRetrieval', 'TwitterURLCorpus', 'SprintDuplicateQuestions',
+                       'CQADupstackAndroidRetrieval', 'CQADupstackMathematicaRetrieval', 'ArguAna',
+                       'CQADupstackProgrammersRetrieval', 'SCIDOCS', 'StackOverflowDupQuestions',
+                       'EmotionClassification', 'TweetSentimentExtractionClassification', 'CQADupstackStatsRetrieval',
+                       'CQADupstackGisRetrieval', 'CQADupstackWordpressRetrieval', 'CQADupstackEnglishRetrieval',
+                       'CQADupstackPhysicsRetrieval', 'CQADupstackGamingRetrieval', 'SciDocsRR', 'FiQA2018',
+                       'CQADupstackUnixRetrieval', 'ToxicConversationsClassification', 'Banking77Classification',
+                       'TwentyNewsgroupsClustering', 'MedrxivClusteringS2S', 'ImdbClassification',
+                       'MTOPDomainClassification', 'BiorxivClusteringS2S', 'AmazonCounterfactualClassification',
+                       'MassiveScenarioClassification', 'MedrxivClusteringP2P', 'MTOPIntentClassification',
+                       'MassiveIntentClassification', 'CQADupstackTexRetrieval', 'AmazonReviewsClassification',
+                       'TRECCOVID', 'BiorxivClusteringP2P', 'StackExchangeClusteringP2P', 'StackExchangeClustering']
+# Task names for the Chinese (C-MTEB) benchmark; not referenced by the other
+# code visible in this change.
+# NOTE(review): 'MultilingualSentiment' is listed twice, and 'MmarcoReranking'
+# is spelled differently from the 'MMarcoReranking' key used in
+# get_task_def_by_task_name_and_type - confirm which spelling is correct.
+CMTEB_TASK_LIST = ['TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'OnlineShopping', 'Waimai',
+                   'AmazonReviewsClassification', 'MassiveIntentClassification', 'MassiveScenarioClassification',
+                   'MultilingualSentiment',
+                   'CLSClusteringS2S', 'CLSClusteringP2P', 'ThuNewsClusteringS2S', 'ThuNewsClusteringP2P',
+                   'Ocnli', 'Cmnli',
+                   'T2Reranking', 'MmarcoReranking', 'CMedQAv1', 'CMedQAv2',
+                   'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval',
+                   'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
+                   'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC', 'STS22']
+# English MTEB task names grouped by task type. get_task_type_en() checks
+# membership in these lists to map a task name to its type.
+TASK_LIST_CLASSIFICATION = [
+    "AmazonCounterfactualClassification",
+    "AmazonPolarityClassification",
+    "AmazonReviewsClassification",
+    "Banking77Classification",
+    "EmotionClassification",
+    "ImdbClassification",
+    "MassiveIntentClassification",
+    "MassiveScenarioClassification",
+    "MTOPDomainClassification",
+    "MTOPIntentClassification",
+    "ToxicConversationsClassification",
+    "TweetSentimentExtractionClassification",
+]
+TASK_LIST_CLUSTERING = [
+    "ArxivClusteringP2P",
+    "ArxivClusteringS2S",
+    "BiorxivClusteringP2P",
+    "BiorxivClusteringS2S",
+    "MedrxivClusteringP2P",
+    "MedrxivClusteringS2S",
+    "RedditClustering",
+    "RedditClusteringP2P",
+    "StackExchangeClustering",
+    "StackExchangeClusteringP2P",
+    "TwentyNewsgroupsClustering",
+]
+TASK_LIST_PAIR_CLASSIFICATION = [
+    "SprintDuplicateQuestions",
+    "TwitterSemEval2015",
+    "TwitterURLCorpus",
+]
+TASK_LIST_RERANKING = [
+    "AskUbuntuDupQuestions",
+    "MindSmallReranking",
+    "SciDocsRR",
+    "StackOverflowDupQuestions",
+]
+TASK_LIST_RETRIEVAL = [
+    "ArguAna",
+    "CQADupstackAndroidRetrieval",
+    "CQADupstackEnglishRetrieval",
+    "CQADupstackGamingRetrieval",
+    "CQADupstackGisRetrieval",
+    "CQADupstackMathematicaRetrieval",
+    "CQADupstackPhysicsRetrieval",
+    "CQADupstackProgrammersRetrieval",
+    "CQADupstackStatsRetrieval",
+    "CQADupstackTexRetrieval",
+    "CQADupstackUnixRetrieval",
+    "CQADupstackWebmastersRetrieval",
+    "CQADupstackWordpressRetrieval",
+    "DBPedia",
+    "FEVER",
+    "FiQA2018",
+    "NFCorpus",
+    "NQ",
+    "QuoraRetrieval",
+    "SCIDOCS",
+    "SciFact",
+    "Touche2020",
+    "TRECCOVID",
+    "ClimateFEVER",
+    "HotpotQA",
+    "MSMARCO",
+]
+# "SummEval" is listed here, but get_task_type_en() returns "Summarization"
+# for it before any of these lists is consulted.
+TASK_LIST_STS = [
+    "BIOSSES",
+    "SICK-R",
+    "STS12",
+    "STS13",
+    "STS14",
+    "STS15",
+    "STS16",
+    "STS17",
+    "STS22",
+    "STSBenchmark",
+    "SummEval",
+]
+# Concatenation of all per-type lists above.
+MTEB_TASK_LIST = (
+        TASK_LIST_CLASSIFICATION
+        + TASK_LIST_CLUSTERING
+        + TASK_LIST_PAIR_CLASSIFICATION
+        + TASK_LIST_RERANKING
+        + TASK_LIST_STS
+        + TASK_LIST_RETRIEVAL
+)
+def get_task_type_en(task_name: str):
+    """Return the MTEB task type string for an English task name.
+
+    Args:
+        task_name: Task name to classify.
+
+    Returns:
+        One of ``"Summarization"``, ``"Classification"``, ``"Clustering"``,
+        ``"PairClassification"``, ``"Reranking"``, ``"STS"`` or ``"Retrieval"``.
+
+    Raises:
+        ValueError: ``task_name`` is not in any of the known task lists.
+    """
+    # SummEval also sits in the STS list, so it has to be resolved first.
+    if task_name == "SummEval":
+        return "Summarization"
+    # Checked in this order; the first list containing the name wins.
+    type_lookup = (
+        ("Classification", TASK_LIST_CLASSIFICATION),
+        ("Clustering", TASK_LIST_CLUSTERING),
+        ("PairClassification", TASK_LIST_PAIR_CLASSIFICATION),
+        ("Reranking", TASK_LIST_RERANKING),
+        ("STS", TASK_LIST_STS),
+        ("Retrieval", TASK_LIST_RETRIEVAL),
+    )
+    for type_label, member_names in type_lookup:
+        if task_name in member_names:
+            return type_label
+    raise ValueError(f"unknown task name:{task_name}")
+def get_task_def_by_task_name_and_type(task_name: str, task_type: str) -> str:
+    """Look up the natural-language task description for an evaluation task.
+
+    Args:
+        task_name: Task identifier, e.g. ``"NQ"``; used as the lookup key for
+            every task type except STS, Summarization and BitextMining.
+        task_type: Task category string that selects the lookup table.
+
+    Returns:
+        The description text for the task.
+
+    Raises:
+        KeyError: ``task_type`` is handled but ``task_name`` has no entry.
+        ValueError: ``task_type`` is not one of the handled categories.
+    """
+    # These categories share one description regardless of the task name.
+    if task_type == 'STS':
+        return "Retrieve semantically similar text."
+    if task_type == 'Summarization':
+        return "Given a news summary, retrieve other semantically similar summaries"
+    if task_type == 'BitextMining':
+        return "Retrieve parallel sentences."
+    if task_type == 'Classification':
+        classification_prompts: Dict[str, str] = {
+            'AmazonCounterfactualClassification': 'Classify a given Amazon customer review text as either counterfactual or not-counterfactual',
+            'AmazonPolarityClassification': 'Classify Amazon reviews into positive or negative sentiment',
+            'AmazonReviewsClassification': 'Classify the given Amazon review into its appropriate rating category',
+            'Banking77Classification': 'Given a online banking query, find the corresponding intents',
+            'EmotionClassification': 'Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise',
+            'ImdbClassification': 'Classify the sentiment expressed in the given movie review text from the IMDB dataset',
+            'MassiveIntentClassification': 'Given a user utterance as query, find the user intents',
+            'MassiveScenarioClassification': 'Given a user utterance as query, find the user scenarios',
+            'MTOPDomainClassification': 'Classify the intent domain of the given utterance in task-oriented conversation',
+            'MTOPIntentClassification': 'Classify the intent of the given utterance in task-oriented conversation',
+            'ToxicConversationsClassification': 'Classify the given comments as either toxic or not toxic',
+            'TweetSentimentExtractionClassification': 'Classify the sentiment of a given tweet as either positive, negative, or neutral',
+            # C-MTEB eval instructions
+            'TNews': 'Classify the fine-grained category of the given news title',
+            'IFlyTek': 'Given an App description text, find the appropriate fine-grained category',
+            'MultilingualSentiment': 'Classify sentiment of the customer review into positive, neutral, or negative',
+            'JDReview': 'Classify the customer review for iPhone on e-commerce platform into positive or negative',
+            'OnlineShopping': 'Classify the customer review for online shopping into positive or negative',
+            'Waimai': 'Classify the customer review from a food takeaway platform into positive or negative',
+        }
+        return classification_prompts[task_name]
+    if task_type == 'Clustering':
+        clustering_prompts: Dict[str, str] = {
+            'ArxivClusteringP2P': 'Identify the main and secondary category of Arxiv papers based on the titles and abstracts',
+            'ArxivClusteringS2S': 'Identify the main and secondary category of Arxiv papers based on the titles',
+            'BiorxivClusteringP2P': 'Identify the main category of Biorxiv papers based on the titles and abstracts',
+            'BiorxivClusteringS2S': 'Identify the main category of Biorxiv papers based on the titles',
+            'MedrxivClusteringP2P': 'Identify the main category of Medrxiv papers based on the titles and abstracts',
+            'MedrxivClusteringS2S': 'Identify the main category of Medrxiv papers based on the titles',
+            'RedditClustering': 'Identify the topic or theme of Reddit posts based on the titles',
+            'RedditClusteringP2P': 'Identify the topic or theme of Reddit posts based on the titles and posts',
+            'StackExchangeClustering': 'Identify the topic or theme of StackExchange posts based on the titles',
+            'StackExchangeClusteringP2P': 'Identify the topic or theme of StackExchange posts based on the given paragraphs',
+            'TwentyNewsgroupsClustering': 'Identify the topic or theme of the given news articles',
+            # C-MTEB eval instructions
+            'CLSClusteringS2S': 'Identify the main category of scholar papers based on the titles',
+            'CLSClusteringP2P': 'Identify the main category of scholar papers based on the titles and abstracts',
+            'ThuNewsClusteringS2S': 'Identify the topic or theme of the given news articles based on the titles',
+            'ThuNewsClusteringP2P': 'Identify the topic or theme of the given news articles based on the titles and contents',
+        }
+        return clustering_prompts[task_name]
+    if task_type == 'Reranking' or task_type == 'PairClassification':
+        pairwise_prompts: Dict[str, str] = {
+            'AskUbuntuDupQuestions': 'Retrieve duplicate questions from AskUbuntu forum',
+            'MindSmallReranking': 'Retrieve relevant news articles based on user browsing history',
+            'SciDocsRR': 'Given a title of a scientific paper, retrieve the titles of other relevant papers',
+            'StackOverflowDupQuestions': 'Retrieve duplicate questions from StackOverflow forum',
+            'SprintDuplicateQuestions': 'Retrieve duplicate questions from Sprint forum',
+            'TwitterSemEval2015': 'Retrieve tweets that are semantically similar to the given tweet',
+            'TwitterURLCorpus': 'Retrieve tweets that are semantically similar to the given tweet',
+            # C-MTEB eval instructions
+            'T2Reranking': 'Given a Chinese search query, retrieve web passages that answer the question',
+            'MMarcoReranking': 'Given a Chinese search query, retrieve web passages that answer the question',
+            'CMedQAv1': 'Given a Chinese community medical question, retrieve replies that best answer the question',
+            'CMedQAv2': 'Given a Chinese community medical question, retrieve replies that best answer the question',
+            'Ocnli': 'Retrieve semantically similar text.',
+            'Cmnli': 'Retrieve semantically similar text.',
+        }
+        return pairwise_prompts[task_name]
+    if task_type == 'Retrieval':
+        # All CQADupstack sub-tasks share a single description.
+        if task_name.lower().startswith('cqadupstack'):
+            return 'Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question'
+        retrieval_prompts: Dict[str, str] = {
+            'ArguAna': 'Given a claim, find documents that refute the claim',
+            'ClimateFEVER': 'Given a claim about climate change, retrieve documents that support or refute the claim',
+            'DBPedia': 'Given a query, retrieve relevant entity descriptions from DBPedia',
+            'FEVER': 'Given a claim, retrieve documents that support or refute the claim',
+            'FiQA2018': 'Given a financial question, retrieve user replies that best answer the question',
+            'HotpotQA': 'Given a multi-hop question, retrieve documents that can help answer the question',
+            'MSMARCO': 'Given a web search query, retrieve relevant passages that answer the query.',
+            'NFCorpus': 'Given a question, retrieve relevant documents that best answer the question',
+            'NQ': 'Given a question, retrieve Wikipedia passages that answer the question',
+            'QuoraRetrieval': 'Given a question, retrieve questions that are semantically equivalent to the given question',
+            'SCIDOCS': 'Given a scientific paper title, retrieve paper abstracts that are cited by the given paper',
+            'SciFact': 'Given a scientific claim, retrieve documents that support or refute the claim',
+            'Touche2020': 'Given a question, retrieve detailed and persuasive arguments that answer the question',
+            'TRECCOVID': 'Given a query on COVID-19, retrieve documents that answer the query',
+            # C-MTEB eval instructions
+            'T2Retrieval': 'Given a Chinese search query, retrieve web passages that answer the question',
+            'MMarcoRetrieval': 'Given a web search query, retrieve relevant passages that answer the query',
+            'DuRetrieval': 'Given a Chinese search query, retrieve web passages that answer the question',
+            'CovidRetrieval': 'Given a question on COVID-19, retrieve news articles that answer the question',
+            'CmedqaRetrieval': 'Given a Chinese community medical question, retrieve replies that best answer the question',
+            'EcomRetrieval': 'Given a user query from an e-commerce website, retrieve description sentences of relevant products',
+            'MedicalRetrieval': 'Given a medical question, retrieve user replies that best answer the question',
+            'VideoRetrieval': 'Given a video search query, retrieve the titles of relevant videos',
+        }
+        # add lower case keys to match some beir names
+        for canonical_name, prompt in list(retrieval_prompts.items()):
+            retrieval_prompts[canonical_name.lower()] = prompt
+        # other cases where lower case match still doesn't work
+        beir_aliases = {
+            'trec-covid': 'TRECCOVID',
+            'climate-fever': 'ClimateFEVER',
+            'dbpedia-entity': 'DBPedia',
+            'webis-touche2020': 'Touche2020',
+            'fiqa': 'FiQA2018',
+            'quora': 'QuoraRetrieval',
+        }
+        for alias, canonical_name in beir_aliases.items():
+            retrieval_prompts[alias] = retrieval_prompts[canonical_name]
+        # for miracl evaluation
+        retrieval_prompts['miracl'] = 'Given a question, retrieve Wikipedia passages that answer the question'
+        return retrieval_prompts[task_name]
+    raise ValueError(f"No instruction config for task {task_name} with type {task_type}")
+def get_detailed_instruct(task_description: str) -> str:
+    """Build the instruction prefix that is placed in front of a query.
+
+    Args:
+        task_description: Task description to embed in the prefix.
+
+    Returns:
+        The formatted prefix, or an empty string when ``task_description``
+        is empty or otherwise falsy.
+    """
+    has_description = bool(task_description)
+    return 'Instruct: {}\nQuery: '.format(task_description) if has_description else ''
+if __name__ == "__main__":
+    # Quick sanity check: report how many tasks the combined MTEB list holds.
+    task_count = len(MTEB_TASK_LIST)
+    print(task_count)

scripts/evaluate_en_mteb/run_evaluate_mteb.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import os
+# Comment out the following line if you can reach huggingface.co directly; it points HF_ENDPOINT at the hf-mirror.com mirror.
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+import mteb
+from model_for_evaluate import MTEB_Sentence_Transformer
+if __name__ == "__main__":
+    # Name of the sub-folder under ./en_results that receives this run's output.
+    model_name = "valid_jasper"
+    model = MTEB_Sentence_Transformer(
+        model_path_or_name="infgrad/jasper_en_vision_language_v1",
+        lang="en",
+        batch_size=27,
+        max_length=400,
+    )
+    try:
+        tasks = list(mteb.get_benchmark("MTEB(eng, classic)"))
+        evaluation = mteb.MTEB(tasks=tasks)
+        evaluation.run(
+            model,
+            output_folder=f"./en_results/{model_name}",
+            overwrite_results=False,
+            verbosity=3
+        )
+    finally:
+        # Always shut the worker pool down, even when the evaluation fails
+        # part-way, so no worker processes are left running.
+        model.model.stop_multi_process_pool(model.pool)