Forbu14 committed
Commit 69d022a
1 Parent(s): 465d01a

adding main files

Files changed (5)
  1. .gitignore +3 -0
  2. Dockerfile +7 -0
  3. eval.py +147 -0
  4. mteb_meta.py +118 -0
  5. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+
+ .DS_Store
+ *.json
Dockerfile ADDED
@@ -0,0 +1,7 @@
+ FROM huggingface/transformers-pytorch-cpu:latest
+
+ # install requirements
+ COPY requirements.txt .
+ RUN pip install -r requirements.txt
+
+
eval.py ADDED
@@ -0,0 +1,147 @@
+ from mteb import MTEB
+ import torch
+ import clip
+
+ import numpy as np
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ MODEL, PREPROCESS = clip.load("RN50", device=DEVICE)
+
+
+ TASK_LIST_CLASSIFICATION = [
+     "AmazonCounterfactualClassification",
+     "AmazonPolarityClassification",
+     "AmazonReviewsClassification",
+     "Banking77Classification",
+     "EmotionClassification",
+     "ImdbClassification",
+     "MassiveIntentClassification",
+     "MassiveScenarioClassification",
+     "MTOPDomainClassification",
+     "MTOPIntentClassification",
+     "ToxicConversationsClassification",
+     "TweetSentimentExtractionClassification",
+ ]
+
+ TASK_LIST_CLUSTERING = [
+     "ArxivClusteringP2P",
+     "ArxivClusteringS2S",
+     "BiorxivClusteringP2P",
+     "BiorxivClusteringS2S",
+     "MedrxivClusteringP2P",
+     "MedrxivClusteringS2S",
+     "RedditClustering",
+     "RedditClusteringP2P",
+     "StackExchangeClustering",
+     "StackExchangeClusteringP2P",
+     "TwentyNewsgroupsClustering",
+ ]
+
+ TASK_LIST_PAIR_CLASSIFICATION = [
+     "SprintDuplicateQuestions",
+     "TwitterSemEval2015",
+     "TwitterURLCorpus",
+ ]
+
+ TASK_LIST_RERANKING = [
+     "AskUbuntuDupQuestions",
+     "MindSmallReranking",
+     "SciDocsRR",
+     "StackOverflowDupQuestions",
+ ]
+
+ TASK_LIST_RETRIEVAL = [
+     "ArguAna",
+     "ClimateFEVER",
+     "CQADupstackAndroidRetrieval",
+     "CQADupstackEnglishRetrieval",
+     "CQADupstackGamingRetrieval",
+     "CQADupstackGisRetrieval",
+     "CQADupstackMathematicaRetrieval",
+     "CQADupstackPhysicsRetrieval",
+     "CQADupstackProgrammersRetrieval",
+     "CQADupstackStatsRetrieval",
+     "CQADupstackTexRetrieval",
+     "CQADupstackUnixRetrieval",
+     "CQADupstackWebmastersRetrieval",
+     "CQADupstackWordpressRetrieval",
+     "DBPedia",
+     "FEVER",
+     "FiQA2018",
+     "HotpotQA",
+     "MSMARCO",
+     "NFCorpus",
+     "NQ",
+     "QuoraRetrieval",
+     "SCIDOCS",
+     "SciFact",
+     "Touche2020",
+     "TRECCOVID",
+ ]
+
+ TASK_LIST_STS = [
+     "BIOSSES",
+     "SICK-R",
+     "STS12",
+     "STS13",
+     "STS14",
+     "STS15",
+     "STS16",
+     "STS17",
+     "STS22",
+     "STSBenchmark",
+     "SummEval",
+ ]
+
+ TASK_LIST = (
+     TASK_LIST_CLASSIFICATION
+     + TASK_LIST_CLUSTERING
+     + TASK_LIST_PAIR_CLASSIFICATION
+     + TASK_LIST_RERANKING
+     + TASK_LIST_RETRIEVAL
+     + TASK_LIST_STS
+ )
+
+
+ class ClipModel:
+     """
+     A wrapper class exposing the CLIP text encoder to MTEB.
+     """
+
+     def encode(self, sentences, batch_size=1, **kwargs):
+         """Returns a list of embeddings for the given sentences.
+
+         Args:
+             sentences (`List[str]`): List of sentences to encode
+             batch_size (`int`): Batch size for the encoding (unused; sentences are encoded one at a time)
+
+         Returns:
+             `List[np.ndarray]`: List of embeddings for the given sentences
+         """
+         embeddings = []
+         for sentence in sentences:
+             try:
+                 # CLIP's tokenizer has a fixed context length of 77 tokens
+                 text = clip.tokenize(sentence).to(DEVICE)[:, :77]
+                 with torch.no_grad():
+                     text_features = MODEL.encode_text(text)
+             except RuntimeError:
+                 # clip.tokenize raises when the input exceeds the context length;
+                 # retry on a truncated prefix of the sentence
+                 print("input too long, truncating")
+                 text = clip.tokenize(sentence[: (77 * 2)]).to(DEVICE)[:, :77]
+                 with torch.no_grad():
+                     text_features = MODEL.encode_text(text)
+             embeddings.append(text_features.cpu().numpy().squeeze())
+
+         return embeddings
+
+
+ model = ClipModel()
+ evaluation = MTEB(tasks=TASK_LIST, output_folder="results/clip/", task_langs=["en"])
+ evaluation.run(model)
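
For reference, the text-embedding step that ClipModel.encode wraps can be exercised on its own. The snippet below is a minimal sketch, assuming the same RN50 checkpoint and the dependencies listed in requirements.txt; the two example sentences are placeholders:

    import clip
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, _ = clip.load("RN50", device=device)

    # Tokenize with truncation so over-length inputs do not raise,
    # then encode into CLIP's joint embedding space.
    tokens = clip.tokenize(["a photo of a dog", "a photo of a cat"], truncate=True).to(device)
    with torch.no_grad():
        features = model.encode_text(tokens)

    print(features.shape)  # should be torch.Size([2, 1024]) for RN50

Here clip.tokenize's truncate flag handles over-length inputs, which is an alternative to the try/except fallback used in eval.py.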
mteb_meta.py ADDED
@@ -0,0 +1,118 @@
+ """
+ Usage: python mteb_meta.py path_to_results_folder
+
+ Creates evaluation results metadata for the model card.
+ E.g.
+ ---
+ tags:
+ - mteb
+ model-index:
+ - name: SGPT-5.8B-weightedmean-msmarco-specb-bitfit
+   results:
+   - task:
+       type: classification
+     dataset:
+       type: mteb/banking77
+       name: MTEB Banking77
+       config: default
+       split: test
+       revision: 44fa15921b4c889113cc5df03dd4901b49161ab7
+     metrics:
+     - type: accuracy
+       value: 84.49350649350649
+ ---
+ """
+
+ import json
+ import logging
+ import os
+ import sys
+
+ from mteb import MTEB
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ results_folder = sys.argv[1].strip("/")
+ model_name = results_folder.split("/")[-1]
+
+ all_results = {}
+
+ for file_name in os.listdir(results_folder):
+     if not file_name.endswith(".json"):
+         logger.info(f"Skipping non-json {file_name}")
+         continue
+     with open(os.path.join(results_folder, file_name), "r", encoding="utf-8") as f:
+         results = json.load(f)
+     all_results = {**all_results, **{file_name.replace(".json", ""): results}}
+
+ MARKER = "---"
+ TAGS = "tags:"
+ MTEB_TAG = "- mteb"
+ HEADER = "model-index:"
+ MODEL = f"- name: {model_name}"
+ RES = "  results:"
+
+ META_STRING = "\n".join([MARKER, TAGS, MTEB_TAG, HEADER, MODEL, RES])
+
+
+ ONE_TASK = "  - task:\n      type: {}\n    dataset:\n      type: {}\n      name: {}\n      config: {}\n      split: {}\n      revision: {}\n    metrics:"
+ ONE_METRIC = "    - type: {}\n      value: {}"
+ SKIP_KEYS = ["std", "evaluation_time", "main_score", "threshold"]
+
+ for ds_name, res_dict in sorted(all_results.items()):
+     mteb_desc = (
+         MTEB(tasks=[ds_name.replace("CQADupstackRetrieval", "CQADupstackAndroidRetrieval")])
+         .tasks[0]
+         .description
+     )
+     hf_hub_name = mteb_desc.get("hf_hub_name", mteb_desc.get("beir_name"))
+     if "CQADupstack" in ds_name:
+         hf_hub_name = "BeIR/cqadupstack"
+     mteb_type = mteb_desc["type"]
+     revision = res_dict.get("dataset_revision")  # Okay if it's None
+     split = "test"
+     if ds_name == "MSMARCO":
+         split = "dev" if "dev" in res_dict else "validation"
+     if split not in res_dict:
+         logger.info(f"Skipping {ds_name} as split {split} not present.")
+         continue
+     res_dict = res_dict.get(split)
+     for lang in mteb_desc["eval_langs"]:
+         mteb_name = f"MTEB {ds_name}"
+         mteb_name += f" ({lang})" if len(mteb_desc["eval_langs"]) > 1 else ""
+         # For English there is no language key if it's the only language
+         test_result_lang = res_dict.get(lang) if len(mteb_desc["eval_langs"]) > 1 else res_dict
+         # Skip if the language was not found but it has other languages
+         if test_result_lang is None:
+             continue
+         META_STRING += "\n" + ONE_TASK.format(
+             mteb_type,
+             hf_hub_name,
+             mteb_name,
+             lang if len(mteb_desc["eval_langs"]) > 1 else "default",
+             split,
+             revision,
+         )
+         for metric, score in test_result_lang.items():
+             if not isinstance(score, dict):
+                 score = {metric: score}
+             for sub_metric, sub_score in score.items():
+                 if any(x in sub_metric for x in SKIP_KEYS):
+                     continue
+                 META_STRING += "\n" + ONE_METRIC.format(
+                     f"{metric}_{sub_metric}" if metric != sub_metric else metric,
+                     # All MTEB scores are 0-1; multiply them by 100 for 3 reasons:
+                     # 1) It's easier to visually digest (you need two chars less: "0.1" -> "1")
+                     # 2) Others may multiply them by 100 when building on MTEB, making it confusing what the range is
+                     #    This happened with the Text and Code Embeddings paper (OpenAI) vs. the original BEIR paper
+                     # 3) It's accepted practice (SuperGLUE, GLUE are 0-100)
+                     sub_score * 100,
+                 )
+
+ META_STRING += "\n" + MARKER
+ if os.path.exists("./mteb_metadata.md"):
+     logger.warning("Overwriting mteb_metadata.md")
+ with open("./mteb_metadata.md", "w") as f:
+     f.write(META_STRING)
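
Once eval.py has produced its per-task JSON files, this script is pointed at that folder, e.g. python mteb_meta.py results/clip (matching the output_folder used in eval.py). It writes mteb_metadata.md containing the YAML front matter shown in the module docstring, ready to paste at the top of a model card README.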
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ mteb
+ ftfy
+ regex
+ tqdm
+ git+https://github.com/openai/CLIP.git
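
Here mteb provides the benchmark harness used by eval.py and mteb_meta.py; ftfy, regex and tqdm are CLIP's own runtime dependencies, and CLIP itself is installed directly from the openai/CLIP GitHub repository, which is how OpenAI distributes it.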