hotchpotch committed on
Commit
43226e0
1 Parent(s): 9bd0c38

Upload folder using huggingface_hub

Browse files
Files changed (46) hide show
  1. .gitattributes +1 -0
  2. 1_Pooling/config.json +10 -0
  3. README.md +133 -0
  4. config.json +25 -0
  5. config_sentence_transformers.json +10 -0
  6. jmteb_config/jmteb.jsonnet +22 -0
  7. jmteb_config/tasks/amazon_counterfactual_classification.jsonnet +32 -0
  8. jmteb_config/tasks/amazon_review_classification.jsonnet +32 -0
  9. jmteb_config/tasks/esci.jsonnet +33 -0
  10. jmteb_config/tasks/jagovfaqs_22k.jsonnet +33 -0
  11. jmteb_config/tasks/jaqket.jsonnet +33 -0
  12. jmteb_config/tasks/jsick.jsonnet +25 -0
  13. jmteb_config/tasks/jsts.jsonnet +25 -0
  14. jmteb_config/tasks/livedoor_news.jsonnet +24 -0
  15. jmteb_config/tasks/massive_intent_classification.jsonnet +32 -0
  16. jmteb_config/tasks/massive_scenario_classification.jsonnet +32 -0
  17. jmteb_config/tasks/mewsc16.jsonnet +24 -0
  18. jmteb_config/tasks/mrtydi.jsonnet +33 -0
  19. jmteb_config/tasks/nlp_journal_abs_intro.jsonnet +33 -0
  20. jmteb_config/tasks/nlp_journal_title_abs.jsonnet +33 -0
  21. jmteb_config/tasks/nlp_journal_title_intro.jsonnet +33 -0
  22. jmteb_config/tasks/paws_x_ja.jsonnet +25 -0
  23. model.safetensors +3 -0
  24. modules.json +14 -0
  25. result/Classification/scores_amazon_counterfactual_classification.json +23 -0
  26. result/Classification/scores_amazon_review_classification.json +23 -0
  27. result/Classification/scores_massive_intent_classification.json +23 -0
  28. result/Classification/scores_massive_scenario_classification.json +23 -0
  29. result/Clustering/scores_livedoor_news.json +36 -0
  30. result/Clustering/scores_mewsc16.json +36 -0
  31. result/PairClassification/scores_paws_x_ja.json +41 -0
  32. result/Reranking/scores_esci.json +31 -0
  33. result/Retrieval/scores_jagovfaqs_22k.json +43 -0
  34. result/Retrieval/scores_jaqket.json +43 -0
  35. result/Retrieval/scores_mrtydi.json +43 -0
  36. result/Retrieval/scores_nlp_journal_abs_intro.json +43 -0
  37. result/Retrieval/scores_nlp_journal_title_abs.json +43 -0
  38. result/Retrieval/scores_nlp_journal_title_intro.json +43 -0
  39. result/STS/scores_jsick.json +31 -0
  40. result/STS/scores_jsts.json +31 -0
  41. result/summary.json +62 -0
  42. sentence_bert_config.json +4 -0
  43. special_tokens_map.json +37 -0
  44. tokenizer.json +3 -0
  45. tokenizer_config.json +63 -0
  46. vocab.txt +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
README.md ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ja
4
+ base_model: cl-nagoya/ruri-pt-base
5
+ tags:
6
+ - sentence-similarity
7
+ - feature-extraction
8
+ license: apache-2.0
9
+ datasets:
10
+ - cl-nagoya/ruri-dataset-ft
11
+ pipeline_tag: sentence-similarity
12
+ ---
13
+
14
+ # Ruri: Japanese General Text Embeddings
15
+
16
+
17
+ ## Usage
18
+
19
+ ### Direct Usage (Sentence Transformers)
20
+
21
+ First install the Sentence Transformers library together with the Japanese tokenizer dependencies (fugashi, sentencepiece, unidic-lite):
22
+
23
+ ```bash
24
+ pip install -U sentence-transformers fugashi sentencepiece unidic-lite
25
+ ```
26
+
27
+ Then you can load this model and run inference.
28
+ ```python
29
+ import torch.nn.functional as F
30
+ from sentence_transformers import SentenceTransformer
31
+
32
+ # Download from the 🤗 Hub
33
+ model = SentenceTransformer("cl-nagoya/ruri-base")
34
+
35
+ # Don't forget to add the prefix "クエリ: " for query-side or "文章: " for passage-side texts.
36
+ sentences = [
37
+ "クエリ: 瑠璃色はどんな色?",
38
+ "文章: 瑠璃色(るりいろ)は、紫みを帯びた濃い青。名は、半貴石の瑠璃(ラピスラズリ、英: lapis lazuli)による。JIS慣用色名では「こい紫みの青」(略号 dp-pB)と定義している[1][2]。",
39
+ "クエリ: ワシやタカのように、鋭いくちばしと爪を持った大型の鳥類を総称して「何類」というでしょう?",
40
+ "文章: ワシ、タカ、ハゲワシ、ハヤブサ、コンドル、フクロウが代表的である。これらの猛禽類はリンネ前後の時代(17~18世紀)には鷲類・鷹類・隼類及び梟類に分類された。ちなみにリンネは狩りをする鳥を単一の目(もく)にまとめ、vultur(コンドル、ハゲワシ)、falco(ワシ、タカ、ハヤブサなど)、strix(フクロウ)、lanius(モズ)の4属を含めている。",
41
+ ]
42
+
43
+ embeddings = model.encode(sentences, convert_to_tensor=True)
44
+ print(embeddings.size())
45
+ # torch.Size([4, 768])
46
+
47
+ similarities = F.cosine_similarity(embeddings.unsqueeze(0), embeddings.unsqueeze(1), dim=2)
48
+ print(similarities)
49
+ # [[1.0000, 0.9421, 0.6844, 0.7167],
50
+ # [0.9421, 1.0000, 0.6626, 0.6863],
51
+ # [0.6844, 0.6626, 1.0000, 0.8785],
52
+ # [0.7167, 0.6863, 0.8785, 1.0000]]
53
+ ```
54
+
55
+ ## Benchmarks
56
+
57
+ ### JMTEB
58
+ Evaluated with [JMTEB](https://github.com/sbintuitions/JMTEB).
59
+
60
+ |Model|#Param.|Avg.|Retrieval|STS|Classification|Reranking|Clustering|PairClassification|
61
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
62
+ |[cl-nagoya/sup-simcse-ja-base](https://huggingface.co/cl-nagoya/sup-simcse-ja-base)|111M|68.56|49.64|82.05|73.47|91.83|51.79|62.57|
63
+ |[cl-nagoya/sup-simcse-ja-large](https://huggingface.co/cl-nagoya/sup-simcse-ja-large)|337M|66.51|37.62|83.18|73.73|91.48|50.56|62.51|
64
+ |[cl-nagoya/unsup-simcse-ja-base](https://huggingface.co/cl-nagoya/unsup-simcse-ja-base)|111M|65.07|40.23|78.72|73.07|91.16|44.77|62.44|
65
+ |[cl-nagoya/unsup-simcse-ja-large](https://huggingface.co/cl-nagoya/unsup-simcse-ja-large)|337M|66.27|40.53|80.56|74.66|90.95|48.41|62.49|
66
+ |[pkshatech/GLuCoSE-base-ja](https://huggingface.co/pkshatech/GLuCoSE-base-ja)|133M|70.44|59.02|78.71|76.82|91.90|49.78|66.39|
67
+ ||||||||||
68
+ |[sentence-transformers/LaBSE](https://huggingface.co/sentence-transformers/LaBSE)|472M|64.70|40.12|76.56|72.66|91.63|44.88|62.33|
69
+ |[intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small)|118M|69.52|67.27|80.07|67.62|93.03|46.91|62.19|
70
+ |[intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base)|278M|70.12|68.21|79.84|69.30|92.85|48.26|62.26|
71
+ |[intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)|560M|71.65|70.98|79.70|72.89|92.96|51.24|62.15|
72
+ ||||||||||
73
+ |OpenAI/text-embedding-ada-002|-|69.48|64.38|79.02|69.75|93.04|48.30|62.40|
74
+ |OpenAI/text-embedding-3-small|-|70.86|66.39|79.46|73.06|92.92|51.06|62.27|
75
+ |OpenAI/text-embedding-3-large|-|73.97|74.48|82.52|77.58|93.58|53.32|62.35|
76
+ ||||||||||
77
+ |[Ruri-Small](https://huggingface.co/cl-nagoya/ruri-small)|68M|71.53|69.41|82.79|76.22|93.00|51.19|62.11|
78
+ |[**Ruri-Base**](https://huggingface.co/cl-nagoya/ruri-base) (this model)|111M|71.91|69.82|82.87|75.58|92.91|54.16|62.38|
79
+ |[Ruri-Large](https://huggingface.co/cl-nagoya/ruri-large)|337M|73.31|73.02|83.13|77.43|92.99|51.82|62.29|
80
+
81
+
82
+
83
+
84
+
85
+ ## Model Details
86
+
87
+ ### Model Description
88
+ - **Model Type:** Sentence Transformer
89
+ - **Base model:** [cl-nagoya/ruri-pt-base](https://huggingface.co/cl-nagoya/ruri-pt-base)
90
+ - **Maximum Sequence Length:** 512 tokens
91
+ - **Output Dimensionality:** 768
92
+ - **Similarity Function:** Cosine Similarity
93
+ - **Language:** Japanese
94
+ - **License:** Apache 2.0
95
+ - **Paper:** https://arxiv.org/abs/2409.07737
96
+ <!-- - **Training Dataset:** Unknown -->
97
+
98
+
99
+ ### Full Model Architecture
100
+
101
+ ```
102
+ SentenceTransformer(
103
+ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
104
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
105
+ )
106
+ ```
107
+
108
+ ### Framework Versions
109
+ - Python: 3.10.13
110
+ - Sentence Transformers: 3.0.0
111
+ - Transformers: 4.41.2
112
+ - PyTorch: 2.3.1+cu118
113
+ - Accelerate: 0.30.1
114
+ - Datasets: 2.19.1
115
+ - Tokenizers: 0.19.1
116
+
117
+ ## Citation
118
+
119
+ ```bibtex
120
+ @misc{
121
+ Ruri,
122
+ title={{Ruri: Japanese General Text Embeddings}},
123
+ author={Hayato Tsukagoshi and Ryohei Sasano},
124
+ year={2024},
125
+ eprint={2409.07737},
126
+ archivePrefix={arXiv},
127
+ primaryClass={cs.CL},
128
+ url={https://arxiv.org/abs/2409.07737},
129
+ }
130
+ ```
131
+
132
+ ## License
133
+ This model is published under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cl-nagoya/ruri-base-pt",
3
+ "architectures": [
4
+ "BertModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "bfloat16",
21
+ "transformers_version": "4.41.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 32768
25
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.0",
4
+ "transformers": "4.41.2",
5
+ "pytorch": "2.3.1+cu118"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
jmteb_config/jmteb.jsonnet ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Classification
2
+ (import './tasks/amazon_review_classification.jsonnet') +
3
+ (import './tasks/amazon_counterfactual_classification.jsonnet') +
4
+ (import './tasks/massive_intent_classification.jsonnet') +
5
+ (import './tasks/massive_scenario_classification.jsonnet') +
6
+ // Clustering
7
+ (import './tasks/livedoor_news.jsonnet') +
8
+ (import './tasks/mewsc16.jsonnet') +
9
+ // STS
10
+ (import './tasks/jsts.jsonnet') +
11
+ (import './tasks/jsick.jsonnet') +
12
+ // Pair Classification
13
+ (import './tasks/paws_x_ja.jsonnet') +
14
+ // Retrieval
15
+ (import './tasks/jagovfaqs_22k.jsonnet') +
16
+ (import './tasks/mrtydi.jsonnet') +
17
+ (import './tasks/jaqket.jsonnet') +
18
+ (import './tasks/nlp_journal_title_abs.jsonnet') +
19
+ (import './tasks/nlp_journal_title_intro.jsonnet') +
20
+ (import './tasks/nlp_journal_abs_intro.jsonnet') +
21
+ // Reranking
22
+ (import './tasks/esci.jsonnet')
jmteb_config/tasks/amazon_counterfactual_classification.jsonnet ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ amazon_counterfactual_classification: {
3
+ class_path: 'ClassificationEvaluator',
4
+ init_args: {
5
+ prefix: 'クエリ: ',
6
+ train_dataset: {
7
+ class_path: 'HfClassificationDataset',
8
+ init_args: {
9
+ path: 'sbintuitions/JMTEB',
10
+ split: 'train',
11
+ name: 'amazon_counterfactual_classification',
12
+ },
13
+ },
14
+ val_dataset: {
15
+ class_path: 'HfClassificationDataset',
16
+ init_args: {
17
+ path: 'sbintuitions/JMTEB',
18
+ split: 'validation',
19
+ name: 'amazon_counterfactual_classification',
20
+ },
21
+ },
22
+ test_dataset: {
23
+ class_path: 'HfClassificationDataset',
24
+ init_args: {
25
+ path: 'sbintuitions/JMTEB',
26
+ split: 'test',
27
+ name: 'amazon_counterfactual_classification',
28
+ },
29
+ },
30
+ },
31
+ },
32
+ }
jmteb_config/tasks/amazon_review_classification.jsonnet ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ amazon_review_classification: {
3
+ class_path: 'ClassificationEvaluator',
4
+ init_args: {
5
+ prefix: 'クエリ: ',
6
+ train_dataset: {
7
+ class_path: 'HfClassificationDataset',
8
+ init_args: {
9
+ path: 'sbintuitions/JMTEB',
10
+ split: 'train',
11
+ name: 'amazon_review_classification',
12
+ },
13
+ },
14
+ val_dataset: {
15
+ class_path: 'HfClassificationDataset',
16
+ init_args: {
17
+ path: 'sbintuitions/JMTEB',
18
+ split: 'validation',
19
+ name: 'amazon_review_classification',
20
+ },
21
+ },
22
+ test_dataset: {
23
+ class_path: 'HfClassificationDataset',
24
+ init_args: {
25
+ path: 'sbintuitions/JMTEB',
26
+ split: 'test',
27
+ name: 'amazon_review_classification',
28
+ },
29
+ },
30
+ },
31
+ },
32
+ }
jmteb_config/tasks/esci.jsonnet ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ esci: {
3
+ class_path: 'RerankingEvaluator',
4
+ init_args: {
5
+ doc_prefix: '文章: ',
6
+ query_prefix: 'クエリ: ',
7
+ val_query_dataset: {
8
+ class_path: 'HfRerankingQueryDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'esci-query',
13
+ },
14
+ },
15
+ test_query_dataset: {
16
+ class_path: 'HfRerankingQueryDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'esci-query',
21
+ },
22
+ },
23
+ doc_dataset: {
24
+ class_path: 'HfRerankingDocDataset',
25
+ init_args: {
26
+ path: 'sbintuitions/JMTEB',
27
+ split: 'corpus',
28
+ name: 'esci-corpus',
29
+ },
30
+ },
31
+ },
32
+ },
33
+ }
jmteb_config/tasks/jagovfaqs_22k.jsonnet ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ jagovfaqs_22k: {
3
+ class_path: 'RetrievalEvaluator',
4
+ init_args: {
5
+ doc_prefix: '文章: ',
6
+ query_prefix: 'クエリ: ',
7
+ val_query_dataset: {
8
+ class_path: 'HfRetrievalQueryDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'jagovfaqs_22k-query',
13
+ },
14
+ },
15
+ test_query_dataset: {
16
+ class_path: 'HfRetrievalQueryDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'jagovfaqs_22k-query',
21
+ },
22
+ },
23
+ doc_dataset: {
24
+ class_path: 'HfRetrievalDocDataset',
25
+ init_args: {
26
+ path: 'sbintuitions/JMTEB',
27
+ split: 'corpus',
28
+ name: 'jagovfaqs_22k-corpus',
29
+ },
30
+ },
31
+ },
32
+ },
33
+ }
jmteb_config/tasks/jaqket.jsonnet ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ jaqket: {
3
+ class_path: 'RetrievalEvaluator',
4
+ init_args: {
5
+ doc_prefix: '文章: ',
6
+ query_prefix: 'クエリ: ',
7
+ val_query_dataset: {
8
+ class_path: 'HfRetrievalQueryDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'jaqket-query',
13
+ },
14
+ },
15
+ test_query_dataset: {
16
+ class_path: 'HfRetrievalQueryDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'jaqket-query',
21
+ },
22
+ },
23
+ doc_dataset: {
24
+ class_path: 'HfRetrievalDocDataset',
25
+ init_args: {
26
+ path: 'sbintuitions/JMTEB',
27
+ split: 'corpus',
28
+ name: 'jaqket-corpus',
29
+ },
30
+ },
31
+ },
32
+ },
33
+ }
jmteb_config/tasks/jsick.jsonnet ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ jsick: {
3
+ class_path: 'STSEvaluator',
4
+ init_args: {
5
+ sentence1_prefix: 'クエリ: ',
6
+ sentence2_prefix: 'クエリ: ',
7
+ val_dataset: {
8
+ class_path: 'HfSTSDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'jsick',
13
+ },
14
+ },
15
+ test_dataset: {
16
+ class_path: 'HfSTSDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'jsick',
21
+ },
22
+ },
23
+ },
24
+ },
25
+ }
jmteb_config/tasks/jsts.jsonnet ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ jsts: {
3
+ class_path: 'STSEvaluator',
4
+ init_args: {
5
+ sentence1_prefix: 'クエリ: ',
6
+ sentence2_prefix: 'クエリ: ',
7
+ val_dataset: {
8
+ class_path: 'HfSTSDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'train',
12
+ name: 'jsts',
13
+ },
14
+ },
15
+ test_dataset: {
16
+ class_path: 'HfSTSDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'jsts',
21
+ },
22
+ },
23
+ },
24
+ },
25
+ }
jmteb_config/tasks/livedoor_news.jsonnet ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ livedoor_news: {
3
+ class_path: 'ClusteringEvaluator',
4
+ init_args: {
5
+ prefix: 'クエリ: ',
6
+ val_dataset: {
7
+ class_path: 'HfClusteringDataset',
8
+ init_args: {
9
+ path: 'sbintuitions/JMTEB',
10
+ split: 'validation',
11
+ name: 'livedoor_news',
12
+ },
13
+ },
14
+ test_dataset: {
15
+ class_path: 'HfClusteringDataset',
16
+ init_args: {
17
+ path: 'sbintuitions/JMTEB',
18
+ split: 'test',
19
+ name: 'livedoor_news',
20
+ },
21
+ },
22
+ },
23
+ },
24
+ }
jmteb_config/tasks/massive_intent_classification.jsonnet ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ massive_intent_classification: {
3
+ class_path: 'ClassificationEvaluator',
4
+ init_args: {
5
+ prefix: 'クエリ: ',
6
+ train_dataset: {
7
+ class_path: 'HfClassificationDataset',
8
+ init_args: {
9
+ path: 'sbintuitions/JMTEB',
10
+ split: 'train',
11
+ name: 'massive_intent_classification',
12
+ },
13
+ },
14
+ val_dataset: {
15
+ class_path: 'HfClassificationDataset',
16
+ init_args: {
17
+ path: 'sbintuitions/JMTEB',
18
+ split: 'validation',
19
+ name: 'massive_intent_classification',
20
+ },
21
+ },
22
+ test_dataset: {
23
+ class_path: 'HfClassificationDataset',
24
+ init_args: {
25
+ path: 'sbintuitions/JMTEB',
26
+ split: 'test',
27
+ name: 'massive_intent_classification',
28
+ },
29
+ },
30
+ },
31
+ },
32
+ }
jmteb_config/tasks/massive_scenario_classification.jsonnet ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ massive_scenario_classification: {
3
+ class_path: 'ClassificationEvaluator',
4
+ init_args: {
5
+ prefix: 'クエリ: ',
6
+ train_dataset: {
7
+ class_path: 'HfClassificationDataset',
8
+ init_args: {
9
+ path: 'sbintuitions/JMTEB',
10
+ split: 'train',
11
+ name: 'massive_scenario_classification',
12
+ },
13
+ },
14
+ val_dataset: {
15
+ class_path: 'HfClassificationDataset',
16
+ init_args: {
17
+ path: 'sbintuitions/JMTEB',
18
+ split: 'validation',
19
+ name: 'massive_scenario_classification',
20
+ },
21
+ },
22
+ test_dataset: {
23
+ class_path: 'HfClassificationDataset',
24
+ init_args: {
25
+ path: 'sbintuitions/JMTEB',
26
+ split: 'test',
27
+ name: 'massive_scenario_classification',
28
+ },
29
+ },
30
+ },
31
+ },
32
+ }
jmteb_config/tasks/mewsc16.jsonnet ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ mewsc16: {
3
+ class_path: 'ClusteringEvaluator',
4
+ init_args: {
5
+ prefix: 'クエリ: ',
6
+ val_dataset: {
7
+ class_path: 'HfClusteringDataset',
8
+ init_args: {
9
+ path: 'sbintuitions/JMTEB',
10
+ split: 'validation',
11
+ name: 'mewsc16_ja',
12
+ },
13
+ },
14
+ test_dataset: {
15
+ class_path: 'HfClusteringDataset',
16
+ init_args: {
17
+ path: 'sbintuitions/JMTEB',
18
+ split: 'test',
19
+ name: 'mewsc16_ja',
20
+ },
21
+ },
22
+ },
23
+ },
24
+ }
jmteb_config/tasks/mrtydi.jsonnet ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ mrtydi: {
3
+ class_path: 'RetrievalEvaluator',
4
+ init_args: {
5
+ doc_prefix: '文章: ',
6
+ query_prefix: 'クエリ: ',
7
+ val_query_dataset: {
8
+ class_path: 'HfRetrievalQueryDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'mrtydi-query',
13
+ },
14
+ },
15
+ test_query_dataset: {
16
+ class_path: 'HfRetrievalQueryDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'mrtydi-query',
21
+ },
22
+ },
23
+ doc_dataset: {
24
+ class_path: 'HfRetrievalDocDataset',
25
+ init_args: {
26
+ path: 'sbintuitions/JMTEB',
27
+ split: 'corpus',
28
+ name: 'mrtydi-corpus',
29
+ },
30
+ },
31
+ },
32
+ },
33
+ }
jmteb_config/tasks/nlp_journal_abs_intro.jsonnet ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ nlp_journal_abs_intro: {
3
+ class_path: 'RetrievalEvaluator',
4
+ init_args: {
5
+ doc_prefix: '文章: ',
6
+ query_prefix: '文章: ',
7
+ val_query_dataset: {
8
+ class_path: 'HfRetrievalQueryDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'nlp_journal_abs_intro-query',
13
+ },
14
+ },
15
+ test_query_dataset: {
16
+ class_path: 'HfRetrievalQueryDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'nlp_journal_abs_intro-query',
21
+ },
22
+ },
23
+ doc_dataset: {
24
+ class_path: 'HfRetrievalDocDataset',
25
+ init_args: {
26
+ path: 'sbintuitions/JMTEB',
27
+ split: 'corpus',
28
+ name: 'nlp_journal_abs_intro-corpus',
29
+ },
30
+ },
31
+ },
32
+ },
33
+ }
jmteb_config/tasks/nlp_journal_title_abs.jsonnet ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ nlp_journal_title_abs: {
3
+ class_path: 'RetrievalEvaluator',
4
+ init_args: {
5
+ doc_prefix: '文章: ',
6
+ query_prefix: 'クエリ: ',
7
+ val_query_dataset: {
8
+ class_path: 'HfRetrievalQueryDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'nlp_journal_title_abs-query',
13
+ },
14
+ },
15
+ test_query_dataset: {
16
+ class_path: 'HfRetrievalQueryDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'nlp_journal_title_abs-query',
21
+ },
22
+ },
23
+ doc_dataset: {
24
+ class_path: 'HfRetrievalDocDataset',
25
+ init_args: {
26
+ path: 'sbintuitions/JMTEB',
27
+ split: 'corpus',
28
+ name: 'nlp_journal_title_abs-corpus',
29
+ },
30
+ },
31
+ },
32
+ },
33
+ }
jmteb_config/tasks/nlp_journal_title_intro.jsonnet ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ nlp_journal_title_intro: {
3
+ class_path: 'RetrievalEvaluator',
4
+ init_args: {
5
+ doc_prefix: '文章: ',
6
+ query_prefix: 'クエリ: ',
7
+ val_query_dataset: {
8
+ class_path: 'HfRetrievalQueryDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'nlp_journal_title_intro-query',
13
+ },
14
+ },
15
+ test_query_dataset: {
16
+ class_path: 'HfRetrievalQueryDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'nlp_journal_title_intro-query',
21
+ },
22
+ },
23
+ doc_dataset: {
24
+ class_path: 'HfRetrievalDocDataset',
25
+ init_args: {
26
+ path: 'sbintuitions/JMTEB',
27
+ split: 'corpus',
28
+ name: 'nlp_journal_title_intro-corpus',
29
+ },
30
+ },
31
+ },
32
+ },
33
+ }
jmteb_config/tasks/paws_x_ja.jsonnet ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ paws_x_ja: {
3
+ class_path: 'PairClassificationEvaluator',
4
+ init_args: {
5
+ sentence1_prefix: 'クエリ: ',
6
+ sentence2_prefix: 'クエリ: ',
7
+ val_dataset: {
8
+ class_path: 'HfPairClassificationDataset',
9
+ init_args: {
10
+ path: 'sbintuitions/JMTEB',
11
+ split: 'validation',
12
+ name: 'paws_x_ja',
13
+ },
14
+ },
15
+ test_dataset: {
16
+ class_path: 'HfPairClassificationDataset',
17
+ init_args: {
18
+ path: 'sbintuitions/JMTEB',
19
+ split: 'test',
20
+ name: 'paws_x_ja',
21
+ },
22
+ },
23
+ },
24
+ },
25
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e863b345faba4d2fb42a8bf0270e52c3391225d6eb08f213046d46caddba6df
3
+ size 222436792
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
result/Classification/scores_amazon_counterfactual_classification.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "macro_f1",
3
+ "metric_value": 0.7665550732749669,
4
+ "details": {
5
+ "optimal_classifier_name": "logreg",
6
+ "val_scores": {
7
+ "knn_cosine_k_2": {
8
+ "accuracy": 0.9098712446351931,
9
+ "macro_f1": 0.6139035745285253
10
+ },
11
+ "logreg": {
12
+ "accuracy": 0.9206008583690987,
13
+ "macro_f1": 0.7381028328396749
14
+ }
15
+ },
16
+ "test_scores": {
17
+ "logreg": {
18
+ "accuracy": 0.923982869379015,
19
+ "macro_f1": 0.7665550732749669
20
+ }
21
+ }
22
+ }
23
+ }
result/Classification/scores_amazon_review_classification.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "macro_f1",
3
+ "metric_value": 0.5575876111411316,
4
+ "details": {
5
+ "optimal_classifier_name": "logreg",
6
+ "val_scores": {
7
+ "knn_cosine_k_2": {
8
+ "accuracy": 0.4314,
9
+ "macro_f1": 0.4209604852624187
10
+ },
11
+ "logreg": {
12
+ "accuracy": 0.5702,
13
+ "macro_f1": 0.5653832808449197
14
+ }
15
+ },
16
+ "test_scores": {
17
+ "logreg": {
18
+ "accuracy": 0.562,
19
+ "macro_f1": 0.5575876111411316
20
+ }
21
+ }
22
+ }
23
+ }
result/Classification/scores_massive_intent_classification.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "macro_f1",
3
+ "metric_value": 0.8141210121425055,
4
+ "details": {
5
+ "optimal_classifier_name": "logreg",
6
+ "val_scores": {
7
+ "knn_cosine_k_2": {
8
+ "accuracy": 0.7757009345794392,
9
+ "macro_f1": 0.7456574019302791
10
+ },
11
+ "logreg": {
12
+ "accuracy": 0.8421052631578947,
13
+ "macro_f1": 0.8271757887821682
14
+ }
15
+ },
16
+ "test_scores": {
17
+ "logreg": {
18
+ "accuracy": 0.8416274377942166,
19
+ "macro_f1": 0.8141210121425055
20
+ }
21
+ }
22
+ }
23
+ }
result/Classification/scores_massive_scenario_classification.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "macro_f1",
3
+ "metric_value": 0.8848812917656395,
4
+ "details": {
5
+ "optimal_classifier_name": "logreg",
6
+ "val_scores": {
7
+ "knn_cosine_k_2": {
8
+ "accuracy": 0.8657156910969012,
9
+ "macro_f1": 0.8581068338871749
10
+ },
11
+ "logreg": {
12
+ "accuracy": 0.8898180029513035,
13
+ "macro_f1": 0.887764836229313
14
+ }
15
+ },
16
+ "test_scores": {
17
+ "logreg": {
18
+ "accuracy": 0.8860121049092132,
19
+ "macro_f1": 0.8848812917656395
20
+ }
21
+ }
22
+ }
23
+ }
result/Clustering/scores_livedoor_news.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "v_measure_score",
3
+ "metric_value": 0.5427223607801758,
4
+ "details": {
5
+ "optimal_clustering_model_name": "BisectingKMeans",
6
+ "val_scores": {
7
+ "MiniBatchKMeans": {
8
+ "v_measure_score": 0.5453092926343514,
9
+ "homogeneity_score": 0.5376167786682042,
10
+ "completeness_score": 0.5532251395371498
11
+ },
12
+ "AgglomerativeClustering": {
13
+ "v_measure_score": 0.5221218542278205,
14
+ "homogeneity_score": 0.5145096860981694,
15
+ "completeness_score": 0.5299626488611732
16
+ },
17
+ "BisectingKMeans": {
18
+ "v_measure_score": 0.5498693214751904,
19
+ "homogeneity_score": 0.5475063196854639,
20
+ "completeness_score": 0.552252808804315
21
+ },
22
+ "Birch": {
23
+ "v_measure_score": 0.5208037508658081,
24
+ "homogeneity_score": 0.5132767763409753,
25
+ "completeness_score": 0.5285547703444661
26
+ }
27
+ },
28
+ "test_scores": {
29
+ "BisectingKMeans": {
30
+ "v_measure_score": 0.5427223607801758,
31
+ "homogeneity_score": 0.5417341205522448,
32
+ "completeness_score": 0.5437142131253088
33
+ }
34
+ }
35
+ }
36
+ }
result/Clustering/scores_mewsc16.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "v_measure_score",
3
+ "metric_value": 0.5404099864321413,
4
+ "details": {
5
+ "optimal_clustering_model_name": "AgglomerativeClustering",
6
+ "val_scores": {
7
+ "MiniBatchKMeans": {
8
+ "v_measure_score": 0.502791381026052,
9
+ "homogeneity_score": 0.5517784337158165,
10
+ "completeness_score": 0.46179324043437603
11
+ },
12
+ "AgglomerativeClustering": {
13
+ "v_measure_score": 0.5302546097654716,
14
+ "homogeneity_score": 0.5735135314580632,
15
+ "completeness_score": 0.4930638394517115
16
+ },
17
+ "BisectingKMeans": {
18
+ "v_measure_score": 0.48656257334532493,
19
+ "homogeneity_score": 0.5342920872487864,
20
+ "completeness_score": 0.4466613135580361
21
+ },
22
+ "Birch": {
23
+ "v_measure_score": 0.49305647750510134,
24
+ "homogeneity_score": 0.5374392451928177,
25
+ "completeness_score": 0.45544495608862656
26
+ }
27
+ },
28
+ "test_scores": {
29
+ "AgglomerativeClustering": {
30
+ "v_measure_score": 0.5404099864321413,
31
+ "homogeneity_score": 0.5789428395923124,
32
+ "completeness_score": 0.5066863291321174
33
+ }
34
+ }
35
+ }
36
+ }
result/PairClassification/scores_paws_x_ja.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "binary_f1",
3
+ "metric_value": 0.6237623762376238,
4
+ "details": {
5
+ "optimal_distance_metric": "euclidean_distances",
6
+ "val_scores": {
7
+ "cosine_distances": {
8
+ "accuracy": 0.5725,
9
+ "accuracy_threshold": 0.6920696496963501,
10
+ "binary_f1": 0.5979670522257273,
11
+ "binary_f1_threshold": 1.0
12
+ },
13
+ "manhatten_distances": {
14
+ "accuracy": 0.6015,
15
+ "accuracy_threshold": 19.63576316833496,
16
+ "binary_f1": 0.6017636684303351,
17
+ "binary_f1_threshold": 274.46441650390625
18
+ },
19
+ "euclidean_distances": {
20
+ "accuracy": 0.602,
21
+ "accuracy_threshold": 0.9731899499893188,
22
+ "binary_f1": 0.6019760056457304,
23
+ "binary_f1_threshold": 12.281266212463379
24
+ },
25
+ "dot_similarities": {
26
+ "accuracy": 0.574,
27
+ "accuracy_threshold": 332.39276123046875,
28
+ "binary_f1": 0.6014825273561596,
29
+ "binary_f1_threshold": 263.39337158203125
30
+ }
31
+ },
32
+ "test_scores": {
33
+ "euclidean_distances": {
34
+ "accuracy": 0.566,
35
+ "accuracy_threshold": 0.9731899499893188,
36
+ "binary_f1": 0.6237623762376238,
37
+ "binary_f1_threshold": 12.281266212463379
38
+ }
39
+ }
40
+ }
41
+ }
result/Reranking/scores_esci.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "ndcg@10",
3
+ "metric_value": 0.9290942178703699,
4
+ "details": {
5
+ "optimal_distance_metric": "cosine_similarity",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "ndcg@10": 0.9419326097489188,
9
+ "ndcg@20": 0.9546274758967366,
10
+ "ndcg@40": 0.9625015652058491
11
+ },
12
+ "dot_score": {
13
+ "ndcg@10": 0.933159692803982,
14
+ "ndcg@20": 0.9482607249371672,
15
+ "ndcg@40": 0.956621759096631
16
+ },
17
+ "euclidean_distance": {
18
+ "ndcg@10": 0.9418339438093611,
19
+ "ndcg@20": 0.9547832679237122,
20
+ "ndcg@40": 0.9627457241783169
21
+ }
22
+ },
23
+ "test_scores": {
24
+ "cosine_similarity": {
25
+ "ndcg@10": 0.9290942178703699,
26
+ "ndcg@20": 0.9467035648480672,
27
+ "ndcg@40": 0.9563220304481116
28
+ }
29
+ }
30
+ }
31
+ }
result/Retrieval/scores_jagovfaqs_22k.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "ndcg@10",
3
+ "metric_value": 0.7455660589538348,
4
+ "details": {
5
+ "optimal_distance_metric": "euclidean_distance",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "accuracy@1": 0.6042702544603685,
9
+ "accuracy@3": 0.7853173442527055,
10
+ "accuracy@5": 0.830944720678561,
11
+ "accuracy@10": 0.8821292775665399,
12
+ "ndcg@10": 0.7477862730518441,
13
+ "mrr@10": 0.7043207426287267
14
+ },
15
+ "dot_score": {
16
+ "accuracy@1": 0.4597835624451594,
17
+ "accuracy@3": 0.6607195086282539,
18
+ "accuracy@5": 0.7282831237203861,
19
+ "accuracy@10": 0.80549868382568,
20
+ "ndcg@10": 0.630976061323317,
21
+ "mrr@10": 0.5752777429583498
22
+ },
23
+ "euclidean_distance": {
24
+ "accuracy@1": 0.6092424685580579,
25
+ "accuracy@3": 0.7861947937993565,
26
+ "accuracy@5": 0.8283123720386077,
27
+ "accuracy@10": 0.8780345130155016,
28
+ "ndcg@10": 0.7480985513112418,
29
+ "mrr@10": 0.7060561428432148
30
+ }
31
+ },
32
+ "test_scores": {
33
+ "euclidean_distance": {
34
+ "accuracy@1": 0.6035087719298246,
35
+ "accuracy@3": 0.7795321637426901,
36
+ "accuracy@5": 0.8277777777777777,
37
+ "accuracy@10": 0.881578947368421,
38
+ "ndcg@10": 0.7455660589538348,
39
+ "mrr@10": 0.7017308317089019
40
+ }
41
+ }
42
+ }
43
+ }
result/Retrieval/scores_jaqket.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "ndcg@10",
3
+ "metric_value": 0.5012253145754781,
4
+ "details": {
5
+ "optimal_distance_metric": "cosine_similarity",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "accuracy@1": 0.3407035175879397,
9
+ "accuracy@3": 0.521608040201005,
10
+ "accuracy@5": 0.6040201005025125,
11
+ "accuracy@10": 0.6894472361809045,
12
+ "ndcg@10": 0.5074962109064866,
13
+ "mrr@10": 0.44994017707585504
14
+ },
15
+ "dot_score": {
16
+ "accuracy@1": 0.31055276381909547,
17
+ "accuracy@3": 0.507537688442211,
18
+ "accuracy@5": 0.5738693467336683,
19
+ "accuracy@10": 0.6804020100502512,
20
+ "ndcg@10": 0.48656131133927916,
21
+ "mrr@10": 0.42555116854111785
22
+ },
23
+ "euclidean_distance": {
24
+ "accuracy@1": 0.3055276381909548,
25
+ "accuracy@3": 0.4814070351758794,
26
+ "accuracy@5": 0.5597989949748744,
27
+ "accuracy@10": 0.6391959798994975,
28
+ "ndcg@10": 0.4655083260444005,
29
+ "mrr@10": 0.4106070032703195
30
+ }
31
+ },
32
+ "test_scores": {
33
+ "cosine_similarity": {
34
+ "accuracy@1": 0.3159478435305918,
35
+ "accuracy@3": 0.526579739217653,
36
+ "accuracy@5": 0.60481444332999,
37
+ "accuracy@10": 0.6920762286860582,
38
+ "ndcg@10": 0.5012253145754781,
39
+ "mrr@10": 0.4404156915190016
40
+ }
41
+ }
42
+ }
43
+ }
result/Retrieval/scores_mrtydi.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "ndcg@10",
3
+ "metric_value": 0.3545113073009125,
4
+ "details": {
5
+ "optimal_distance_metric": "euclidean_distance",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "accuracy@1": 0.22306034482758622,
9
+ "accuracy@3": 0.37176724137931033,
10
+ "accuracy@5": 0.4536637931034483,
11
+ "accuracy@10": 0.5549568965517241,
12
+ "ndcg@10": 0.37815020333355365,
13
+ "mrr@10": 0.3228995621236997
14
+ },
15
+ "dot_score": {
16
+ "accuracy@1": 0.13793103448275862,
17
+ "accuracy@3": 0.2704741379310345,
18
+ "accuracy@5": 0.3394396551724138,
19
+ "accuracy@10": 0.4170258620689655,
20
+ "ndcg@10": 0.2698064952674162,
21
+ "mrr@10": 0.22368979200875752
22
+ },
23
+ "euclidean_distance": {
24
+ "accuracy@1": 0.22844827586206898,
25
+ "accuracy@3": 0.38362068965517243,
26
+ "accuracy@5": 0.4665948275862069,
27
+ "accuracy@10": 0.5668103448275862,
28
+ "ndcg@10": 0.38745306818571434,
29
+ "mrr@10": 0.33128378147235893
30
+ }
31
+ },
32
+ "test_scores": {
33
+ "euclidean_distance": {
34
+ "accuracy@1": 0.23194444444444445,
35
+ "accuracy@3": 0.3888888888888889,
36
+ "accuracy@5": 0.46805555555555556,
37
+ "accuracy@10": 0.5708333333333333,
38
+ "ndcg@10": 0.3545113073009125,
39
+ "mrr@10": 0.3320238095238095
40
+ }
41
+ }
42
+ }
43
+ }
result/Retrieval/scores_nlp_journal_abs_intro.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "ndcg@10",
3
+ "metric_value": 0.8689204088388403,
4
+ "details": {
5
+ "optimal_distance_metric": "cosine_similarity",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "accuracy@1": 0.85,
9
+ "accuracy@3": 0.93,
10
+ "accuracy@5": 0.93,
11
+ "accuracy@10": 0.95,
12
+ "ndcg@10": 0.9031188595062929,
13
+ "mrr@10": 0.8877777777777779
14
+ },
15
+ "dot_score": {
16
+ "accuracy@1": 0.75,
17
+ "accuracy@3": 0.87,
18
+ "accuracy@5": 0.88,
19
+ "accuracy@10": 0.91,
20
+ "ndcg@10": 0.8329701303885662,
21
+ "mrr@10": 0.8079563492063491
22
+ },
23
+ "euclidean_distance": {
24
+ "accuracy@1": 0.83,
25
+ "accuracy@3": 0.92,
26
+ "accuracy@5": 0.93,
27
+ "accuracy@10": 0.94,
28
+ "ndcg@10": 0.8903171995628786,
29
+ "mrr@10": 0.87375
30
+ }
31
+ },
32
+ "test_scores": {
33
+ "cosine_similarity": {
34
+ "accuracy@1": 0.7945544554455446,
35
+ "accuracy@3": 0.8836633663366337,
36
+ "accuracy@5": 0.9084158415841584,
37
+ "accuracy@10": 0.943069306930693,
38
+ "ndcg@10": 0.8689204088388403,
39
+ "mrr@10": 0.8452508643721514
40
+ }
41
+ }
42
+ }
43
+ }
result/Retrieval/scores_nlp_journal_title_abs.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "ndcg@10",
3
+ "metric_value": 0.9656989703684407,
4
+ "details": {
5
+ "optimal_distance_metric": "cosine_similarity",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "accuracy@1": 0.9,
9
+ "accuracy@3": 0.96,
10
+ "accuracy@5": 0.98,
11
+ "accuracy@10": 0.99,
12
+ "ndcg@10": 0.9477320812882918,
13
+ "mrr@10": 0.9339444444444445
14
+ },
15
+ "dot_score": {
16
+ "accuracy@1": 0.82,
17
+ "accuracy@3": 0.92,
18
+ "accuracy@5": 0.94,
19
+ "accuracy@10": 0.96,
20
+ "ndcg@10": 0.8940025955079818,
21
+ "mrr@10": 0.8724285714285713
22
+ },
23
+ "euclidean_distance": {
24
+ "accuracy@1": 0.89,
25
+ "accuracy@3": 0.97,
26
+ "accuracy@5": 0.98,
27
+ "accuracy@10": 0.99,
28
+ "ndcg@10": 0.9453171995628784,
29
+ "mrr@10": 0.9304166666666666
30
+ }
31
+ },
32
+ "test_scores": {
33
+ "cosine_similarity": {
34
+ "accuracy@1": 0.9306930693069307,
35
+ "accuracy@3": 0.9777227722772277,
36
+ "accuracy@5": 0.9876237623762376,
37
+ "accuracy@10": 0.995049504950495,
38
+ "ndcg@10": 0.9656989703684407,
39
+ "mrr@10": 0.955987741631306
40
+ }
41
+ }
42
+ }
43
+ }
result/Retrieval/scores_nlp_journal_title_intro.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "ndcg@10",
3
+ "metric_value": 0.7531306059721564,
4
+ "details": {
5
+ "optimal_distance_metric": "cosine_similarity",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "accuracy@1": 0.57,
9
+ "accuracy@3": 0.8,
10
+ "accuracy@5": 0.83,
11
+ "accuracy@10": 0.9,
12
+ "ndcg@10": 0.7448902792577736,
13
+ "mrr@10": 0.6942023809523811
14
+ },
15
+ "dot_score": {
16
+ "accuracy@1": 0.49,
17
+ "accuracy@3": 0.68,
18
+ "accuracy@5": 0.71,
19
+ "accuracy@10": 0.83,
20
+ "ndcg@10": 0.6537395005077568,
21
+ "mrr@10": 0.5984801587301588
22
+ },
23
+ "euclidean_distance": {
24
+ "accuracy@1": 0.58,
25
+ "accuracy@3": 0.75,
26
+ "accuracy@5": 0.85,
27
+ "accuracy@10": 0.9,
28
+ "ndcg@10": 0.7411266935263704,
29
+ "mrr@10": 0.6896904761904763
30
+ }
31
+ },
32
+ "test_scores": {
33
+ "cosine_similarity": {
34
+ "accuracy@1": 0.6237623762376238,
35
+ "accuracy@3": 0.7896039603960396,
36
+ "accuracy@5": 0.8242574257425742,
37
+ "accuracy@10": 0.8811881188118812,
38
+ "ndcg@10": 0.7531306059721564,
39
+ "mrr@10": 0.7120059327361306
40
+ }
41
+ }
42
+ }
43
+ }
result/STS/scores_jsick.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "spearman",
3
+ "metric_value": 0.8231772134744029,
4
+ "details": {
5
+ "optimal_similarity_metric": "cosine_similarity",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "pearson": 0.8390312744889947,
9
+ "spearman": 0.8309726355825223
10
+ },
11
+ "manhatten_distance": {
12
+ "pearson": 0.8439757378089565,
13
+ "spearman": 0.8296746939532708
14
+ },
15
+ "euclidean_distance": {
16
+ "pearson": 0.8439757378089565,
17
+ "spearman": 0.8296746939532708
18
+ },
19
+ "dot_score": {
20
+ "pearson": 0.8235943624962084,
21
+ "spearman": 0.8066842966908715
22
+ }
23
+ },
24
+ "test_scores": {
25
+ "cosine_similarity": {
26
+ "pearson": 0.8323321086750828,
27
+ "spearman": 0.8231772134744029
28
+ }
29
+ }
30
+ }
31
+ }
result/STS/scores_jsts.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_name": "spearman",
3
+ "metric_value": 0.8342848039994751,
4
+ "details": {
5
+ "optimal_similarity_metric": "manhatten_distance",
6
+ "val_scores": {
7
+ "cosine_similarity": {
8
+ "pearson": 0.8402004412140045,
9
+ "spearman": 0.7947630577888891
10
+ },
11
+ "manhatten_distance": {
12
+ "pearson": 0.8359705278620446,
13
+ "spearman": 0.7954996671020325
14
+ },
15
+ "euclidean_distance": {
16
+ "pearson": 0.8359705278620446,
17
+ "spearman": 0.7954996671020325
18
+ },
19
+ "dot_score": {
20
+ "pearson": 0.8146522053769387,
21
+ "spearman": 0.7576805023715597
22
+ }
23
+ },
24
+ "test_scores": {
25
+ "manhatten_distance": {
26
+ "pearson": 0.8665411120423515,
27
+ "spearman": 0.8342848039994751
28
+ }
29
+ }
30
+ }
31
+ }
result/summary.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Classification": {
3
+ "amazon_counterfactual_classification": {
4
+ "macro_f1": 0.7665550732749669
5
+ },
6
+ "amazon_review_classification": {
7
+ "macro_f1": 0.5575876111411316
8
+ },
9
+ "massive_intent_classification": {
10
+ "macro_f1": 0.8141210121425055
11
+ },
12
+ "massive_scenario_classification": {
13
+ "macro_f1": 0.8848812917656395
14
+ }
15
+ },
16
+ "Reranking": {
17
+ "esci": {
18
+ "ndcg@10": 0.9290942178703699
19
+ }
20
+ },
21
+ "Retrieval": {
22
+ "jagovfaqs_22k": {
23
+ "ndcg@10": 0.7455660589538348
24
+ },
25
+ "jaqket": {
26
+ "ndcg@10": 0.5012253145754781
27
+ },
28
+ "mrtydi": {
29
+ "ndcg@10": 0.3545113073009125
30
+ },
31
+ "nlp_journal_abs_intro": {
32
+ "ndcg@10": 0.8689204088388403
33
+ },
34
+ "nlp_journal_title_abs": {
35
+ "ndcg@10": 0.9656989703684407
36
+ },
37
+ "nlp_journal_title_intro": {
38
+ "ndcg@10": 0.7531306059721564
39
+ }
40
+ },
41
+ "STS": {
42
+ "jsick": {
43
+ "spearman": 0.8231772134744029
44
+ },
45
+ "jsts": {
46
+ "spearman": 0.8342848039994751
47
+ }
48
+ },
49
+ "Clustering": {
50
+ "livedoor_news": {
51
+ "v_measure_score": 0.5427223607801758
52
+ },
53
+ "mewsc16": {
54
+ "v_measure_score": 0.5404099864321413
55
+ }
56
+ },
57
+ "PairClassification": {
58
+ "paws_x_ja": {
59
+ "binary_f1": 0.6237623762376238
60
+ }
61
+ }
62
+ }
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c564888dbafbfebcc7a62d2f4049afe83e6fb5acd50a97082a84407c77ddd8ff
3
+ size 17082922
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": false,
47
+ "do_subword_tokenize": true,
48
+ "do_word_tokenize": true,
49
+ "jumanpp_kwargs": null,
50
+ "mask_token": "[MASK]",
51
+ "mecab_kwargs": {
52
+ "mecab_dic": "unidic_lite"
53
+ },
54
+ "model_max_length": 512,
55
+ "never_split": null,
56
+ "pad_token": "[PAD]",
57
+ "sep_token": "[SEP]",
58
+ "subword_tokenizer_type": "wordpiece",
59
+ "sudachi_kwargs": null,
60
+ "tokenizer_class": "BertJapaneseTokenizer",
61
+ "unk_token": "[UNK]",
62
+ "word_tokenizer_type": "mecab"
63
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff