nreimers committed on
Commit e018959
1 Parent(s): 34905fb
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "word_embedding_dimension": 768,
+     "pooling_mode_cls_token": false,
+     "pooling_mode_mean_tokens": true,
+     "pooling_mode_max_tokens": false,
+     "pooling_mode_mean_sqrt_len_tokens": false
+ }
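The pooling configuration above enables mean pooling over token embeddings (CLS, max, and sqrt-length pooling are disabled). As a rough illustration of what `pooling_mode_mean_tokens` computes, here is a minimal mean-pooling sketch in plain PyTorch; it mirrors the semantics of this config but is not the library's internal implementation:

```python
import torch

def mean_pooling(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings while ignoring padding positions.

    token_embeddings: (batch, seq_len, 768); attention_mask: (batch, seq_len) with 1 for real tokens.
    Illustrates pooling_mode_mean_tokens=true from 1_Pooling/config.json.
    """
    mask = attention_mask.unsqueeze(-1).float()       # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)     # sum embeddings of non-padding tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)          # number of real tokens, avoid division by zero
    return summed / counts                            # (batch, 768) sentence embeddings
```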
README.md ADDED
@@ -0,0 +1,91 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ ---
+
+ # {MODEL_NAME}
+
+ This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ <!--- Describe your model here -->
+
+ ## Usage (Sentence-Transformers)
+
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('{MODEL_NAME}')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+ ## Evaluation Results
+
+ <!--- Describe how your model was evaluated -->
+
+ For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+
+ ## Training
+ The model was trained with the following parameters:
+
+ **DataLoader**:
+
+ `MultiDatasetDataLoader.MultiDatasetDataLoader` of length 5371 with parameters:
+ ```
+ {'batch_size': 'unknown'}
+ ```
+
+ **Loss**:
+
+ `sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
+ ```
+ {'scale': 20, 'similarity_fct': 'dot_score'}
+ ```
+
+ Parameters of the fit() method:
+ ```
+ {
+     "callback": null,
+     "epochs": 1,
+     "evaluation_steps": 0,
+     "evaluator": "NoneType",
+     "max_grad_norm": 1,
+     "optimizer_class": "<class 'transformers.optimization.AdamW'>",
+     "optimizer_params": {
+         "lr": 2e-05
+     },
+     "scheduler": "warmupconstant",
+     "steps_per_epoch": 10000,
+     "warmup_steps": 500,
+     "weight_decay": 0.01
+ }
+ ```
+
+ ## Full Model Architecture
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel
+   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+   (2): Normalize()
+ )
+ ```
+
+ ## Citing & Authors
+
+ <!--- Describe where people can find more information -->
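Because the model's last module L2-normalizes the embeddings (see the architecture above), dot product and cosine similarity give the same ranking. A small usage sketch for scoring sentence pairs; `{MODEL_NAME}` is the card's placeholder for the actual model id, and the example sentences are made up:

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('{MODEL_NAME}')  # placeholder: substitute the real model id

queries = ["How do I sort a list in Python?"]
docs = ["Use the sorted() builtin or list.sort().",
        "Paris is the capital of France."]

query_emb = model.encode(queries, convert_to_tensor=True)  # (1, 768), L2-normalized
doc_emb = model.encode(docs, convert_to_tensor=True)       # (2, 768), L2-normalized

# With normalized embeddings, dot product equals cosine similarity.
scores = util.dot_score(query_emb, doc_emb)                # (1, 2) similarity matrix
print(scores)
```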
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+     "_name_or_path": "distilroberta-base",
+     "architectures": [
+         "RobertaModel"
+     ],
+     "attention_probs_dropout_prob": 0.1,
+     "bos_token_id": 0,
+     "eos_token_id": 2,
+     "gradient_checkpointing": false,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 768,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "layer_norm_eps": 1e-05,
+     "max_position_embeddings": 514,
+     "model_type": "roberta",
+     "num_attention_heads": 12,
+     "num_hidden_layers": 6,
+     "pad_token_id": 1,
+     "position_embedding_type": "absolute",
+     "transformers_version": "4.6.1",
+     "type_vocab_size": 1,
+     "use_cache": true,
+     "vocab_size": 50265
+ }
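The backbone declared here is a 6-layer DistilRoBERTa encoder with 768-dimensional hidden states, matching the pooling dimension above. A quick, assumed-usage sketch for inspecting the committed config with the standard `transformers` API (the local path is illustrative):

```python
from transformers import AutoConfig

# Load config.json from the model directory (path is illustrative).
config = AutoConfig.from_pretrained("./")

print(config.model_type)               # "roberta"
print(config.hidden_size)              # 768, matches word_embedding_dimension in 1_Pooling/config.json
print(config.num_hidden_layers)        # 6, half the layers of roberta-base
print(config.max_position_embeddings)  # 514; RoBERTa offsets positions by 2, so 512 usable tokens
```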
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "__version__": {
+         "sentence_transformers": "2.0.0",
+         "transformers": "4.6.1",
+         "pytorch": "1.8.1"
+     }
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+     {
+         "idx": 0,
+         "name": "0",
+         "path": "",
+         "type": "sentence_transformers.models.Transformer"
+     },
+     {
+         "idx": 1,
+         "name": "1",
+         "path": "1_Pooling",
+         "type": "sentence_transformers.models.Pooling"
+     },
+     {
+         "idx": 2,
+         "name": "2",
+         "path": "2_Normalize",
+         "type": "sentence_transformers.models.Normalize"
+     }
+ ]
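modules.json wires up the three modules (Transformer → Pooling → Normalize) that sentence-transformers assembles when this repository is loaded. As a sketch of what that corresponds to, the same stack can be built by hand, mirroring the construction in train_script.py further below (the base checkpoint name is taken from config.json):

```python
from sentence_transformers import SentenceTransformer, models

# Module 0: the DistilRoBERTa encoder, truncating inputs at 128 tokens.
word_embedding_model = models.Transformer("distilroberta-base", max_seq_length=128)

# Module 1: mean pooling over token embeddings (matches 1_Pooling/config.json).
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)

# Module 2: L2-normalize the pooled sentence embedding.
normalize = models.Normalize()

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, normalize])
```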
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eec3c58b3fd1ca767f783848b856c58c38dcaaab8904d267cd55e11387b28b16
+ size 328520407
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+     "max_seq_length": 128,
+     "do_lower_case": false
+ }
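sentence_bert_config.json caps inputs at 128 tokens; longer texts are truncated by the Transformer module. A small sketch of checking and, if needed, overriding that limit after loading (again using the card's `{MODEL_NAME}` placeholder):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('{MODEL_NAME}')  # placeholder model id from the card above

print(model.max_seq_length)  # 128, read from sentence_bert_config.json

# Texts longer than 128 tokens are truncated. The backbone supports up to 512 positions,
# so the limit can be raised, but embedding quality beyond the training length is untested.
model.max_seq_length = 256
```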
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "distilroberta-base"}
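The tokenizer is the standard RoBERTa byte-level BPE, wrapping inputs in `<s>`/`</s>` with a 512-token `model_max_length`. A small illustrative sketch of loading it with the `transformers` API and inspecting how a sentence is split (the local path is an assumption):

```python
from transformers import AutoTokenizer

# Load vocab.json / merges.txt / tokenizer.json from the model directory (path is illustrative).
tokenizer = AutoTokenizer.from_pretrained("./")

encoded = tokenizer("This is an example sentence")
print(encoded["input_ids"])                                   # starts with 0 (<s>), ends with 2 (</s>)
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))  # byte-level BPE tokens
print(tokenizer.model_max_length)                             # 512
```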
train_script.py ADDED
@@ -0,0 +1,120 @@
+ import math
+ from sentence_transformers import models, losses, datasets
+ from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
+ from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+ import logging
+ from datetime import datetime
+ import sys
+ import os
+ import gzip
+ import csv
+ from MultiDatasetDataLoader import MultiDatasetDataLoader
+ from shutil import copyfile
+ import json
+ import argparse
+
+ #### Just some code to print debug information to stdout
+ logging.basicConfig(format='%(asctime)s - %(message)s',
+                     datefmt='%Y-%m-%d %H:%M:%S',
+                     level=logging.INFO,
+                     handlers=[LoggingHandler()])
+ #### /print debug information to stdout
+
+ #model_name = 'distilroberta-base'
+ #batch_size_pairs = 200
+ #batch_size_triplets = 200
+ #steps_per_epoch = 10000
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased')
+ parser.add_argument('--steps', type=int, default=2000)
+ parser.add_argument('--batch_size_pairs', type=int, default=256)
+ parser.add_argument('--batch_size_triplets', type=int, default=256)
+ parser.add_argument('--data', nargs='+', default=[])
+ parser.add_argument('--name')
+ args = parser.parse_args()
+
+ model_name = args.model                          #'nreimers/MiniLM-L6-H384-uncased'
+ batch_size_pairs = args.batch_size_pairs         #256
+ batch_size_triplets = args.batch_size_triplets   #256
+ steps_per_epoch = args.steps                     #2000
+
+ num_epochs = 1
+ max_seq_length = 128
+ use_amp = True
+ warmup_steps = 500
+
+ #####
+
+ output_path = 'output/training_data_benchmark-{}-norm-{}'.format(model_name.replace("/", "-"), args.name)
+ logging.info("Output: " + output_path)
+ if os.path.exists(output_path):
+     exit()
+
+ # Write train script to output path
+ os.makedirs(output_path, exist_ok=True)
+
+ train_script_path = os.path.join(output_path, 'train_script.py')
+ copyfile(__file__, train_script_path)
+ with open(train_script_path, 'a') as fOut:
+     fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
+
+ ## SentenceTransformer model
+ word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+ norm = models.Normalize()
+ model = SentenceTransformer(modules=[word_embedding_model, pooling_model, norm])
+
+ datasets = []
+ for filepath in args.data:
+     filepath = filepath.strip()
+     dataset = []
+
+     with gzip.open(filepath, 'rt', encoding='utf8') as fIn:
+         for line in fIn:
+             data = json.loads(line.strip())
+
+             if not isinstance(data, dict):
+                 data = {'guid': None, 'texts': data}
+
+             dataset.append(InputExample(guid=data.get('guid', None), texts=data['texts']))
+             if len(dataset) >= (steps_per_epoch * batch_size_pairs * 2):
+                 break
+
+     datasets.append(dataset)
+     logging.info("{}: {}".format(filepath, len(dataset)))
+
+ train_dataloader = MultiDatasetDataLoader(datasets, batch_size_pairs=batch_size_pairs, batch_size_triplets=batch_size_triplets, random_batch_fraction=0.25)
+
+ # Our training loss
+ train_loss = losses.MultipleNegativesRankingLoss(model, scale=20, similarity_fct=util.dot_score)
+
+ #Read STSbenchmark dataset and use it as development set
+
+ # Configure the training
+ logging.info("Warmup-steps: {}".format(warmup_steps))
+
+ # Train the model
+ model.fit(train_objectives=[(train_dataloader, train_loss)],
+           evaluator=None,
+           epochs=1,
+           warmup_steps=warmup_steps,
+           steps_per_epoch=steps_per_epoch,
+           scheduler='warmupconstant',
+           use_amp=use_amp
+           )
+
+ model.save(output_path)
+
+ # Script was called via:
+ #python training_data_benchmark_norm_cos.py --name codesearch-full --model distilroberta-base --steps 10000 --data data/codesearchnet.jsonl.gz
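train_script.py reads each `--data` file as a gzipped JSONL file in which every line is either a plain list of texts (a pair or triplet) or a dict with `guid` and `texts` keys, as the parsing loop above shows. A minimal sketch of producing such a file; the file name and example texts are made up for illustration:

```python
import gzip
import json

# Hypothetical training examples; real data would come from your corpus.
examples = [
    ["how to open a file in python", "use the built-in open() function"],        # a pair as a plain list
    {"guid": "ex-2",
     "texts": ["sort a list", "use sorted(my_list)", "paris is in france"]},     # a triplet with a guid
]

with gzip.open("data/my_dataset.jsonl.gz", "wt", encoding="utf8") as fOut:
    for example in examples:
        fOut.write(json.dumps(example) + "\n")

# The script would then be invoked along the lines of the recorded call above, e.g.:
#   python train_script.py --name my-run --model distilroberta-base --steps 10000 --data data/my_dataset.jsonl.gz
```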
vocab.json ADDED
The diff for this file is too large to render. See raw diff