FINGU-AI committed on
Commit
2a0cf32
·
verified ·
1 Parent(s): b6981f4

Upload folder using huggingface_hub

Browse files
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 1536,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
2_Dense/config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"in_features": 1536, "out_features": 1024, "bias": true, "activation_function": "torch.nn.modules.linear.Identity"}
2_Dense/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39a89d7ba72da375a547c9468750b9d6bb3fee0b284f902eb5b6df9d8408db53
3
+ size 6295712
README.md CHANGED
@@ -1,3 +1,645 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: dunzhang/stella_en_1.5B_v5
3
+ datasets: []
4
+ language: []
5
+ library_name: sentence-transformers
6
+ metrics:
7
+ - cosine_accuracy@1
8
+ - cosine_accuracy@3
9
+ - cosine_accuracy@5
10
+ - cosine_accuracy@10
11
+ - cosine_precision@1
12
+ - cosine_precision@3
13
+ - cosine_precision@5
14
+ - cosine_precision@10
15
+ - cosine_recall@1
16
+ - cosine_recall@3
17
+ - cosine_recall@5
18
+ - cosine_recall@10
19
+ - cosine_ndcg@10
20
+ - cosine_mrr@10
21
+ - cosine_map@100
22
+ pipeline_tag: sentence-similarity
23
+ tags:
24
+ - sentence-transformers
25
+ - sentence-similarity
26
+ - feature-extraction
27
+ - generated_from_trainer
28
+ - dataset_size:693000
29
+ - loss:MatryoshkaLoss
30
+ - loss:MultipleNegativesRankingLoss
31
+ widget:
32
+ - source_sentence: Paracrystalline materials are defined as having short and medium
33
+ range ordering in their lattice (similar to the liquid crystal phases) but lacking
34
+ crystal-like long-range ordering at least in one direction.
35
+ sentences:
36
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
37
+ query.
38
+
39
+ Query: Paracrystalline'
40
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
41
+ query.
42
+
43
+ Query: Øystein Dahle'
44
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
45
+ query.
46
+
47
+ Query: Makis Belevonis'
48
+ - source_sentence: 'Hạ Trạch is a commune ( xã ) and village in Bố Trạch District
49
+ , Quảng Bình Province , in Vietnam . Category : Populated places in Quang Binh
50
+ Province Category : Communes of Quang Binh Province'
51
+ sentences:
52
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
53
+ query.
54
+
55
+ Query: The Taill of how this forsaid Tod maid his Confessioun to Freir Wolf Waitskaith'
56
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
57
+ query.
58
+
59
+ Query: Hạ Trạch'
60
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
61
+ query.
62
+
63
+ Query: Tadaxa'
64
+ - source_sentence: The Golden Mosque (سنهرى مسجد, Sunehri Masjid) is a mosque in Old
65
+ Delhi. It is located outside the southwestern corner of Delhi Gate of the Red
66
+ Fort, opposite the Netaji Subhash Park.
67
+ sentences:
68
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
69
+ query.
70
+
71
+ Query: Algorithm'
72
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
73
+ query.
74
+
75
+ Query: Golden Mosque (Red Fort)'
76
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
77
+ query.
78
+
79
+ Query: Parnaso Español'
80
+ - source_sentence: Unibank, S.A. is one of Haiti's two largest private commercial
81
+ banks. The bank was founded in 1993 by a group of Haitian investors and is the
82
+ main company of "Groupe Financier National (GFN)". It opened its first office
83
+ in July 1993 in downtown Port-au-Prince and has 50 branches throughout the country
84
+ as of the end of 2016.
85
+ sentences:
86
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
87
+ query.
88
+
89
+ Query: Sky TG24'
90
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
91
+ query.
92
+
93
+ Query: Ghomijeh'
94
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
95
+ query.
96
+
97
+ Query: Unibank (Haiti)'
98
+ - source_sentence: The Tchaikovsky Symphony Orchestra is a Russian classical music
99
+ orchestra established in 1930. It was founded as the Moscow Radio Symphony Orchestra,
100
+ and served as the official symphony for the Soviet All-Union Radio network. Following
101
+ the dissolution of the, Soviet Union in 1991, the orchestra was renamed in 1993
102
+ by the Russian Ministry of Culture in recognition of the central role the music
103
+ of Tchaikovsky plays in its repertoire. The current music director is Vladimir
104
+ Fedoseyev, who has been in that position since 1974.
105
+ sentences:
106
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
107
+ query.
108
+
109
+ Query: Harald J.W. Mueller-Kirsten'
110
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
111
+ query.
112
+
113
+ Query: Sierra del Lacandón'
114
+ - 'Instruct: Given a web search query, retrieve relevant passages that answer the
115
+ query.
116
+
117
+ Query: Tchaikovsky Symphony Orchestra'
118
+ model-index:
119
+ - name: SentenceTransformer based on dunzhang/stella_en_1.5B_v5
120
+ results:
121
+ - task:
122
+ type: information-retrieval
123
+ name: Information Retrieval
124
+ dataset:
125
+ name: Unknown
126
+ type: unknown
127
+ metrics:
128
+ - type: cosine_accuracy@1
129
+ value: 0.9387205387205387
130
+ name: Cosine Accuracy@1
131
+ - type: cosine_accuracy@3
132
+ value: 0.9646464646464646
133
+ name: Cosine Accuracy@3
134
+ - type: cosine_accuracy@5
135
+ value: 0.9734006734006734
136
+ name: Cosine Accuracy@5
137
+ - type: cosine_accuracy@10
138
+ value: 0.9818181818181818
139
+ name: Cosine Accuracy@10
140
+ - type: cosine_precision@1
141
+ value: 0.9387205387205387
142
+ name: Cosine Precision@1
143
+ - type: cosine_precision@3
144
+ value: 0.32154882154882153
145
+ name: Cosine Precision@3
146
+ - type: cosine_precision@5
147
+ value: 0.19468013468013465
148
+ name: Cosine Precision@5
149
+ - type: cosine_precision@10
150
+ value: 0.09818181818181818
151
+ name: Cosine Precision@10
152
+ - type: cosine_recall@1
153
+ value: 0.9387205387205387
154
+ name: Cosine Recall@1
155
+ - type: cosine_recall@3
156
+ value: 0.9646464646464646
157
+ name: Cosine Recall@3
158
+ - type: cosine_recall@5
159
+ value: 0.9734006734006734
160
+ name: Cosine Recall@5
161
+ - type: cosine_recall@10
162
+ value: 0.9818181818181818
163
+ name: Cosine Recall@10
164
+ - type: cosine_ndcg@10
165
+ value: 0.9604189096111768
166
+ name: Cosine Ndcg@10
167
+ - type: cosine_mrr@10
168
+ value: 0.9535509860509859
169
+ name: Cosine Mrr@10
170
+ - type: cosine_map@100
171
+ value: 0.9540030317604424
172
+ name: Cosine Map@100
173
+ - type: cosine_accuracy@1
174
+ value: 0.938047138047138
175
+ name: Cosine Accuracy@1
176
+ - type: cosine_accuracy@3
177
+ value: 0.9643097643097643
178
+ name: Cosine Accuracy@3
179
+ - type: cosine_accuracy@5
180
+ value: 0.9734006734006734
181
+ name: Cosine Accuracy@5
182
+ - type: cosine_accuracy@10
183
+ value: 0.9801346801346801
184
+ name: Cosine Accuracy@10
185
+ - type: cosine_precision@1
186
+ value: 0.938047138047138
187
+ name: Cosine Precision@1
188
+ - type: cosine_precision@3
189
+ value: 0.3214365881032548
190
+ name: Cosine Precision@3
191
+ - type: cosine_precision@5
192
+ value: 0.19468013468013465
193
+ name: Cosine Precision@5
194
+ - type: cosine_precision@10
195
+ value: 0.09801346801346798
196
+ name: Cosine Precision@10
197
+ - type: cosine_recall@1
198
+ value: 0.938047138047138
199
+ name: Cosine Recall@1
200
+ - type: cosine_recall@3
201
+ value: 0.9643097643097643
202
+ name: Cosine Recall@3
203
+ - type: cosine_recall@5
204
+ value: 0.9734006734006734
205
+ name: Cosine Recall@5
206
+ - type: cosine_recall@10
207
+ value: 0.9801346801346801
208
+ name: Cosine Recall@10
209
+ - type: cosine_ndcg@10
210
+ value: 0.9595228125760605
211
+ name: Cosine Ndcg@10
212
+ - type: cosine_mrr@10
213
+ value: 0.9528592806370585
214
+ name: Cosine Mrr@10
215
+ - type: cosine_map@100
216
+ value: 0.9534396603676074
217
+ name: Cosine Map@100
218
+ - type: cosine_accuracy@1
219
+ value: 0.9387205387205387
220
+ name: Cosine Accuracy@1
221
+ - type: cosine_accuracy@3
222
+ value: 0.9622895622895623
223
+ name: Cosine Accuracy@3
224
+ - type: cosine_accuracy@5
225
+ value: 0.9703703703703703
226
+ name: Cosine Accuracy@5
227
+ - type: cosine_accuracy@10
228
+ value: 0.9787878787878788
229
+ name: Cosine Accuracy@10
230
+ - type: cosine_precision@1
231
+ value: 0.9387205387205387
232
+ name: Cosine Precision@1
233
+ - type: cosine_precision@3
234
+ value: 0.3207631874298541
235
+ name: Cosine Precision@3
236
+ - type: cosine_precision@5
237
+ value: 0.19407407407407404
238
+ name: Cosine Precision@5
239
+ - type: cosine_precision@10
240
+ value: 0.09787878787878787
241
+ name: Cosine Precision@10
242
+ - type: cosine_recall@1
243
+ value: 0.9387205387205387
244
+ name: Cosine Recall@1
245
+ - type: cosine_recall@3
246
+ value: 0.9622895622895623
247
+ name: Cosine Recall@3
248
+ - type: cosine_recall@5
249
+ value: 0.9703703703703703
250
+ name: Cosine Recall@5
251
+ - type: cosine_recall@10
252
+ value: 0.9787878787878788
253
+ name: Cosine Recall@10
254
+ - type: cosine_ndcg@10
255
+ value: 0.9588799906525647
256
+ name: Cosine Ndcg@10
257
+ - type: cosine_mrr@10
258
+ value: 0.9525124258457593
259
+ name: Cosine Mrr@10
260
+ - type: cosine_map@100
261
+ value: 0.9530933506069861
262
+ name: Cosine Map@100
263
+ ---
264
+
265
+ # SentenceTransformer based on dunzhang/stella_en_1.5B_v5
266
+
267
+ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [dunzhang/stella_en_1.5B_v5](https://huggingface.co/dunzhang/stella_en_1.5B_v5). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
268
+
269
+ ## Model Details
270
+
271
+ ### Model Description
272
+ - **Model Type:** Sentence Transformer
273
+ - **Base model:** [dunzhang/stella_en_1.5B_v5](https://huggingface.co/dunzhang/stella_en_1.5B_v5) <!-- at revision 129dc50d3ca5f0f5ee0ce8944f65a8553c0f26e0 -->
274
+ - **Maximum Sequence Length:** 8096 tokens
275
+ - **Output Dimensionality:** 1024 dimensions
276
+ - **Similarity Function:** Cosine Similarity
277
+ <!-- - **Training Dataset:** Unknown -->
278
+ <!-- - **Language:** Unknown -->
279
+ <!-- - **License:** Unknown -->
280
+
281
+ ### Model Sources
282
+
283
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
284
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
285
+ - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
286
+
287
+ ### Full Model Architecture
288
+
289
+ ```
290
+ SentenceTransformer(
291
+ (0): Transformer({'max_seq_length': 8096, 'do_lower_case': False}) with Transformer model: Qwen2Model
292
+ (1): Pooling({'word_embedding_dimension': 1536, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
293
+ (2): Dense({'in_features': 1536, 'out_features': 1024, 'bias': True, 'activation_function': 'torch.nn.modules.linear.Identity'})
294
+ )
295
+ ```
296
+
297
+ ## Usage
298
+
299
+ ### Direct Usage (Sentence Transformers)
300
+
301
+ First install the Sentence Transformers library:
302
+
303
+ ```bash
304
+ pip install -U sentence-transformers
305
+ ```
306
+
307
+ Then you can load this model and run inference.
308
+ ```python
309
+ from sentence_transformers import SentenceTransformer
310
+
311
+ # Download from the 🤗 Hub
312
+ model = SentenceTransformer("sentence_transformers_model_id")
313
+ # Run inference
314
+ sentences = [
315
+ 'The Tchaikovsky Symphony Orchestra is a Russian classical music orchestra established in 1930. It was founded as the Moscow Radio Symphony Orchestra, and served as the official symphony for the Soviet All-Union Radio network. Following the dissolution of the, Soviet Union in 1991, the orchestra was renamed in 1993 by the Russian Ministry of Culture in recognition of the central role the music of Tchaikovsky plays in its repertoire. The current music director is Vladimir Fedoseyev, who has been in that position since 1974.',
316
+ 'Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: Tchaikovsky Symphony Orchestra',
317
+ 'Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: Sierra del Lacandón',
318
+ ]
319
+ embeddings = model.encode(sentences)
320
+ print(embeddings.shape)
321
+ # [3, 1024]
322
+
323
+ # Get the similarity scores for the embeddings
324
+ similarities = model.similarity(embeddings, embeddings)
325
+ print(similarities.shape)
326
+ # [3, 3]
327
+ ```
328
+
329
+ <!--
330
+ ### Direct Usage (Transformers)
331
+
332
+ <details><summary>Click to see the direct usage in Transformers</summary>
333
+
334
+ </details>
335
+ -->
336
+
337
+ <!--
338
+ ### Downstream Usage (Sentence Transformers)
339
+
340
+ You can finetune this model on your own dataset.
341
+
342
+ <details><summary>Click to expand</summary>
343
+
344
+ </details>
345
+ -->
346
+
347
+ <!--
348
+ ### Out-of-Scope Use
349
+
350
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
351
+ -->
352
+
353
+ ## Evaluation
354
+
355
+ ### Metrics
356
+
357
+ #### Information Retrieval
358
+
359
+ * Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
360
+
361
+ | Metric | Value |
362
+ |:--------------------|:----------|
363
+ | cosine_accuracy@1 | 0.9387 |
364
+ | cosine_accuracy@3 | 0.9646 |
365
+ | cosine_accuracy@5 | 0.9734 |
366
+ | cosine_accuracy@10 | 0.9818 |
367
+ | cosine_precision@1 | 0.9387 |
368
+ | cosine_precision@3 | 0.3215 |
369
+ | cosine_precision@5 | 0.1947 |
370
+ | cosine_precision@10 | 0.0982 |
371
+ | cosine_recall@1 | 0.9387 |
372
+ | cosine_recall@3 | 0.9646 |
373
+ | cosine_recall@5 | 0.9734 |
374
+ | cosine_recall@10 | 0.9818 |
375
+ | cosine_ndcg@10 | 0.9604 |
376
+ | cosine_mrr@10 | 0.9536 |
377
+ | **cosine_map@100** | **0.954** |
378
+
379
+ #### Information Retrieval
380
+
381
+ * Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
382
+
383
+ | Metric | Value |
384
+ |:--------------------|:-----------|
385
+ | cosine_accuracy@1 | 0.938 |
386
+ | cosine_accuracy@3 | 0.9643 |
387
+ | cosine_accuracy@5 | 0.9734 |
388
+ | cosine_accuracy@10 | 0.9801 |
389
+ | cosine_precision@1 | 0.938 |
390
+ | cosine_precision@3 | 0.3214 |
391
+ | cosine_precision@5 | 0.1947 |
392
+ | cosine_precision@10 | 0.098 |
393
+ | cosine_recall@1 | 0.938 |
394
+ | cosine_recall@3 | 0.9643 |
395
+ | cosine_recall@5 | 0.9734 |
396
+ | cosine_recall@10 | 0.9801 |
397
+ | cosine_ndcg@10 | 0.9595 |
398
+ | cosine_mrr@10 | 0.9529 |
399
+ | **cosine_map@100** | **0.9534** |
400
+
401
+ #### Information Retrieval
402
+
403
+ * Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
404
+
405
+ | Metric | Value |
406
+ |:--------------------|:-----------|
407
+ | cosine_accuracy@1 | 0.9387 |
408
+ | cosine_accuracy@3 | 0.9623 |
409
+ | cosine_accuracy@5 | 0.9704 |
410
+ | cosine_accuracy@10 | 0.9788 |
411
+ | cosine_precision@1 | 0.9387 |
412
+ | cosine_precision@3 | 0.3208 |
413
+ | cosine_precision@5 | 0.1941 |
414
+ | cosine_precision@10 | 0.0979 |
415
+ | cosine_recall@1 | 0.9387 |
416
+ | cosine_recall@3 | 0.9623 |
417
+ | cosine_recall@5 | 0.9704 |
418
+ | cosine_recall@10 | 0.9788 |
419
+ | cosine_ndcg@10 | 0.9589 |
420
+ | cosine_mrr@10 | 0.9525 |
421
+ | **cosine_map@100** | **0.9531** |
422
+
423
+ <!--
424
+ ## Bias, Risks and Limitations
425
+
426
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
427
+ -->
428
+
429
+ <!--
430
+ ### Recommendations
431
+
432
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
433
+ -->
434
+
435
+ ## Training Details
436
+
437
+ ### Training Hyperparameters
438
+ #### Non-Default Hyperparameters
439
+
440
+ - `eval_strategy`: steps
441
+ - `per_device_eval_batch_size`: 4
442
+ - `gradient_accumulation_steps`: 4
443
+ - `learning_rate`: 2e-05
444
+ - `max_steps`: 1500
445
+ - `lr_scheduler_type`: cosine
446
+ - `warmup_ratio`: 0.1
447
+ - `warmup_steps`: 5
448
+ - `bf16`: True
449
+ - `tf32`: True
450
+ - `optim`: adamw_torch_fused
451
+ - `gradient_checkpointing`: True
452
+ - `gradient_checkpointing_kwargs`: {'use_reentrant': False}
453
+ - `batch_sampler`: no_duplicates
454
+
455
+ #### All Hyperparameters
456
+ <details><summary>Click to expand</summary>
457
+
458
+ - `overwrite_output_dir`: False
459
+ - `do_predict`: False
460
+ - `eval_strategy`: steps
461
+ - `prediction_loss_only`: True
462
+ - `per_device_train_batch_size`: 8
463
+ - `per_device_eval_batch_size`: 4
464
+ - `per_gpu_train_batch_size`: None
465
+ - `per_gpu_eval_batch_size`: None
466
+ - `gradient_accumulation_steps`: 4
467
+ - `eval_accumulation_steps`: None
468
+ - `learning_rate`: 2e-05
469
+ - `weight_decay`: 0.0
470
+ - `adam_beta1`: 0.9
471
+ - `adam_beta2`: 0.999
472
+ - `adam_epsilon`: 1e-08
473
+ - `max_grad_norm`: 1.0
474
+ - `num_train_epochs`: 3.0
475
+ - `max_steps`: 1500
476
+ - `lr_scheduler_type`: cosine
477
+ - `lr_scheduler_kwargs`: {}
478
+ - `warmup_ratio`: 0.1
479
+ - `warmup_steps`: 5
480
+ - `log_level`: passive
481
+ - `log_level_replica`: warning
482
+ - `log_on_each_node`: True
483
+ - `logging_nan_inf_filter`: True
484
+ - `save_safetensors`: True
485
+ - `save_on_each_node`: False
486
+ - `save_only_model`: False
487
+ - `restore_callback_states_from_checkpoint`: False
488
+ - `no_cuda`: False
489
+ - `use_cpu`: False
490
+ - `use_mps_device`: False
491
+ - `seed`: 42
492
+ - `data_seed`: None
493
+ - `jit_mode_eval`: False
494
+ - `use_ipex`: False
495
+ - `bf16`: True
496
+ - `fp16`: False
497
+ - `fp16_opt_level`: O1
498
+ - `half_precision_backend`: auto
499
+ - `bf16_full_eval`: False
500
+ - `fp16_full_eval`: False
501
+ - `tf32`: True
502
+ - `local_rank`: 0
503
+ - `ddp_backend`: None
504
+ - `tpu_num_cores`: None
505
+ - `tpu_metrics_debug`: False
506
+ - `debug`: []
507
+ - `dataloader_drop_last`: True
508
+ - `dataloader_num_workers`: 0
509
+ - `dataloader_prefetch_factor`: None
510
+ - `past_index`: -1
511
+ - `disable_tqdm`: False
512
+ - `remove_unused_columns`: True
513
+ - `label_names`: None
514
+ - `load_best_model_at_end`: False
515
+ - `ignore_data_skip`: False
516
+ - `fsdp`: []
517
+ - `fsdp_min_num_params`: 0
518
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
519
+ - `fsdp_transformer_layer_cls_to_wrap`: None
520
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
521
+ - `deepspeed`: None
522
+ - `label_smoothing_factor`: 0.0
523
+ - `optim`: adamw_torch_fused
524
+ - `optim_args`: None
525
+ - `adafactor`: False
526
+ - `group_by_length`: False
527
+ - `length_column_name`: length
528
+ - `ddp_find_unused_parameters`: None
529
+ - `ddp_bucket_cap_mb`: None
530
+ - `ddp_broadcast_buffers`: False
531
+ - `dataloader_pin_memory`: True
532
+ - `dataloader_persistent_workers`: False
533
+ - `skip_memory_metrics`: True
534
+ - `use_legacy_prediction_loop`: False
535
+ - `push_to_hub`: False
536
+ - `resume_from_checkpoint`: None
537
+ - `hub_model_id`: None
538
+ - `hub_strategy`: every_save
539
+ - `hub_private_repo`: False
540
+ - `hub_always_push`: False
541
+ - `gradient_checkpointing`: True
542
+ - `gradient_checkpointing_kwargs`: {'use_reentrant': False}
543
+ - `include_inputs_for_metrics`: False
544
+ - `eval_do_concat_batches`: True
545
+ - `fp16_backend`: auto
546
+ - `push_to_hub_model_id`: None
547
+ - `push_to_hub_organization`: None
548
+ - `mp_parameters`:
549
+ - `auto_find_batch_size`: False
550
+ - `full_determinism`: False
551
+ - `torchdynamo`: None
552
+ - `ray_scope`: last
553
+ - `ddp_timeout`: 1800
554
+ - `torch_compile`: False
555
+ - `torch_compile_backend`: None
556
+ - `torch_compile_mode`: None
557
+ - `dispatch_batches`: None
558
+ - `split_batches`: None
559
+ - `include_tokens_per_second`: False
560
+ - `include_num_input_tokens_seen`: False
561
+ - `neftune_noise_alpha`: None
562
+ - `optim_target_modules`: None
563
+ - `batch_eval_metrics`: False
564
+ - `batch_sampler`: no_duplicates
565
+ - `multi_dataset_batch_sampler`: proportional
566
+
567
+ </details>
568
+
569
+ ### Training Logs
570
+ | Epoch | Step | Training Loss | loss | cosine_map@100 |
571
+ |:------:|:----:|:-------------:|:------:|:--------------:|
572
+ | 0.0185 | 100 | 0.4835 | 0.0751 | 0.9138 |
573
+ | 0.0369 | 200 | 0.0646 | 0.0590 | 0.9384 |
574
+ | 0.0554 | 300 | 0.0594 | 0.0519 | 0.9462 |
575
+ | 0.0739 | 400 | 0.0471 | 0.0483 | 0.9514 |
576
+ | 0.0924 | 500 | 0.0524 | 0.0455 | 0.9531 |
577
+
578
+
579
+ ### Framework Versions
580
+ - Python: 3.10.12
581
+ - Sentence Transformers: 3.0.1
582
+ - Transformers: 4.41.2
583
+ - PyTorch: 2.2.0+cu121
584
+ - Accelerate: 0.33.0
585
+ - Datasets: 2.20.0
586
+ - Tokenizers: 0.19.1
587
+
588
+ ## Citation
589
+
590
+ ### BibTeX
591
+
592
+ #### Sentence Transformers
593
+ ```bibtex
594
+ @inproceedings{reimers-2019-sentence-bert,
595
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
596
+ author = "Reimers, Nils and Gurevych, Iryna",
597
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
598
+ month = "11",
599
+ year = "2019",
600
+ publisher = "Association for Computational Linguistics",
601
+ url = "https://arxiv.org/abs/1908.10084",
602
+ }
603
+ ```
604
+
605
+ #### MatryoshkaLoss
606
+ ```bibtex
607
+ @misc{kusupati2024matryoshka,
608
+ title={Matryoshka Representation Learning},
609
+ author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
610
+ year={2024},
611
+ eprint={2205.13147},
612
+ archivePrefix={arXiv},
613
+ primaryClass={cs.LG}
614
+ }
615
+ ```
616
+
617
+ #### MultipleNegativesRankingLoss
618
+ ```bibtex
619
+ @misc{henderson2017efficient,
620
+ title={Efficient Natural Language Response Suggestion for Smart Reply},
621
+ author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
622
+ year={2017},
623
+ eprint={1705.00652},
624
+ archivePrefix={arXiv},
625
+ primaryClass={cs.CL}
626
+ }
627
+ ```
628
+
629
+ <!--
630
+ ## Glossary
631
+
632
+ *Clearly define terms in order to be accessible across audiences.*
633
+ -->
634
+
635
+ <!--
636
+ ## Model Card Authors
637
+
638
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
639
+ -->
640
+
641
+ <!--
642
+ ## Model Card Contact
643
+
644
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
645
+ -->
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "dunzhang/stella_en_1.5B_v5",
3
+ "architectures": [
4
+ "Qwen2Model"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModel": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2Model",
9
+ "AutoModelForCausalLM": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2ForCausalLM",
10
+ "AutoModelForSequenceClassification": "dunzhang/stella_en_1.5B_v5--modeling_qwen.Qwen2ForSequenceClassification"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "eos_token_id": 151643,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 1536,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 8960,
18
+ "max_position_embeddings": 131072,
19
+ "max_window_layers": 21,
20
+ "model_type": "qwen2",
21
+ "num_attention_heads": 12,
22
+ "num_hidden_layers": 28,
23
+ "num_key_value_heads": 2,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_theta": 1000000.0,
26
+ "sliding_window": 131072,
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.41.2",
30
+ "use_cache": true,
31
+ "use_sliding_window": false,
32
+ "vocab_size": 151646
33
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.41.2",
5
+ "pytorch": "2.2.0+cu121"
6
+ },
7
+ "prompts": {
8
+ "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
9
+ "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
10
+ },
11
+ "default_prompt_name": null,
12
+ "similarity_fn_name": "cosine"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7ee0e91dcb228507e173f171a5c5968be4ad204258c68f6cdb24c2e8825e13a
3
+ size 3086574240
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Dense",
18
+ "type": "sentence_transformers.models.Dense"
19
+ }
20
+ ]
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d86be4b11edd5d3dcd69118ec7753a1a73bb654684382e9a9da1632054322f4
3
+ size 6185963010
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c053b46e7a28059e963f2f68cc70993c339964d6c12ea8c75976949b919ea2e
3
+ size 14960
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a0ad6c6adcc1a6927e1ea68485aef9a0c7275e57e60bd35cf2bd42192fa4c67
3
+ size 14960
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75d9c2738d1ea79e02e32a61933fe7ab9ae41612958344b1feb6917ca5028ee
3
+ size 14960
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:596b3989057185ab4b1a72ab4517039e72c4335523b8abd59ee585af6209c251
3
+ size 14960
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85f0d0c166b31ce5d46d5889fe2e1e0ed59cd0f9d34582c72c5961b2ed34454
3
+ size 1064
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 8096,
3
+ "do_lower_case": false
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_eos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [
31
+ "<|im_start|>",
32
+ "<|im_end|>"
33
+ ],
34
+ "auto_map": {
35
+ "AutoTokenizer": [
36
+ "dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2Tokenizer",
37
+ "dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2TokenizerFast"
38
+ ]
39
+ },
40
+ "bos_token": null,
41
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
42
+ "clean_up_tokenization_spaces": false,
43
+ "eos_token": "<|endoftext|>",
44
+ "errors": "replace",
45
+ "model_max_length": 512,
46
+ "pad_token": "<|endoftext|>",
47
+ "split_special_tokens": false,
48
+ "tokenizer_class": "Qwen2Tokenizer",
49
+ "unk_token": null
50
+ }
trainer_state.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.09235315847801995,
5
+ "eval_steps": 100,
6
+ "global_step": 500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01847063169560399,
13
+ "grad_norm": 17.696088790893555,
14
+ "learning_rate": 1.980139427847242e-05,
15
+ "loss": 0.4835,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.01847063169560399,
20
+ "eval_cosine_accuracy@1": 0.8868686868686869,
21
+ "eval_cosine_accuracy@10": 0.9579124579124579,
22
+ "eval_cosine_accuracy@3": 0.9343434343434344,
23
+ "eval_cosine_accuracy@5": 0.9454545454545454,
24
+ "eval_cosine_map@100": 0.9137674463447905,
25
+ "eval_cosine_mrr@10": 0.9124690021912245,
26
+ "eval_cosine_ndcg@10": 0.9236137131767355,
27
+ "eval_cosine_precision@1": 0.8868686868686869,
28
+ "eval_cosine_precision@10": 0.09579124579124577,
29
+ "eval_cosine_precision@3": 0.3114478114478115,
30
+ "eval_cosine_precision@5": 0.18909090909090906,
31
+ "eval_cosine_recall@1": 0.8868686868686869,
32
+ "eval_cosine_recall@10": 0.9579124579124579,
33
+ "eval_cosine_recall@3": 0.9343434343434344,
34
+ "eval_cosine_recall@5": 0.9454545454545454,
35
+ "eval_loss": 0.07506837695837021,
36
+ "eval_runtime": 49.7303,
37
+ "eval_samples_per_second": 140.759,
38
+ "eval_sequential_score": 0.9137674463447905,
39
+ "eval_steps_per_second": 8.808,
40
+ "step": 100
41
+ },
42
+ {
43
+ "epoch": 0.03694126339120798,
44
+ "grad_norm": 2.3271963596343994,
45
+ "learning_rate": 1.917211301505453e-05,
46
+ "loss": 0.0646,
47
+ "step": 200
48
+ },
49
+ {
50
+ "epoch": 0.03694126339120798,
51
+ "eval_cosine_accuracy@1": 0.9195286195286195,
52
+ "eval_cosine_accuracy@10": 0.967003367003367,
53
+ "eval_cosine_accuracy@3": 0.9518518518518518,
54
+ "eval_cosine_accuracy@5": 0.9612794612794613,
55
+ "eval_cosine_map@100": 0.9383862806127067,
56
+ "eval_cosine_mrr@10": 0.937248810859922,
57
+ "eval_cosine_ndcg@10": 0.9445895366693552,
58
+ "eval_cosine_precision@1": 0.9195286195286195,
59
+ "eval_cosine_precision@10": 0.0967003367003367,
60
+ "eval_cosine_precision@3": 0.317283950617284,
61
+ "eval_cosine_precision@5": 0.19225589225589226,
62
+ "eval_cosine_recall@1": 0.9195286195286195,
63
+ "eval_cosine_recall@10": 0.967003367003367,
64
+ "eval_cosine_recall@3": 0.9518518518518518,
65
+ "eval_cosine_recall@5": 0.9612794612794613,
66
+ "eval_loss": 0.05896875262260437,
67
+ "eval_runtime": 51.3489,
68
+ "eval_samples_per_second": 136.322,
69
+ "eval_sequential_score": 0.9383862806127067,
70
+ "eval_steps_per_second": 8.53,
71
+ "step": 200
72
+ },
73
+ {
74
+ "epoch": 0.05541189508681197,
75
+ "grad_norm": 3.7376925945281982,
76
+ "learning_rate": 1.8139290433532415e-05,
77
+ "loss": 0.0594,
78
+ "step": 300
79
+ },
80
+ {
81
+ "epoch": 0.05541189508681197,
82
+ "eval_cosine_accuracy@1": 0.9303030303030303,
83
+ "eval_cosine_accuracy@10": 0.9737373737373738,
84
+ "eval_cosine_accuracy@3": 0.9579124579124579,
85
+ "eval_cosine_accuracy@5": 0.9656565656565657,
86
+ "eval_cosine_map@100": 0.9462389210939364,
87
+ "eval_cosine_mrr@10": 0.9454570840681954,
88
+ "eval_cosine_ndcg@10": 0.9523521034308455,
89
+ "eval_cosine_precision@1": 0.9303030303030303,
90
+ "eval_cosine_precision@10": 0.09737373737373735,
91
+ "eval_cosine_precision@3": 0.31930415263748596,
92
+ "eval_cosine_precision@5": 0.1931313131313131,
93
+ "eval_cosine_recall@1": 0.9303030303030303,
94
+ "eval_cosine_recall@10": 0.9737373737373738,
95
+ "eval_cosine_recall@3": 0.9579124579124579,
96
+ "eval_cosine_recall@5": 0.9656565656565657,
97
+ "eval_loss": 0.051894593983888626,
98
+ "eval_runtime": 49.496,
99
+ "eval_samples_per_second": 141.426,
100
+ "eval_sequential_score": 0.9462389210939364,
101
+ "eval_steps_per_second": 8.849,
102
+ "step": 300
103
+ },
104
+ {
105
+ "epoch": 0.07388252678241596,
106
+ "grad_norm": 0.3877984583377838,
107
+ "learning_rate": 1.6748367163042577e-05,
108
+ "loss": 0.0471,
109
+ "step": 400
110
+ },
111
+ {
112
+ "epoch": 0.07388252678241596,
113
+ "eval_cosine_accuracy@1": 0.9367003367003367,
114
+ "eval_cosine_accuracy@10": 0.9750841750841751,
115
+ "eval_cosine_accuracy@3": 0.9612794612794613,
116
+ "eval_cosine_accuracy@5": 0.969023569023569,
117
+ "eval_cosine_map@100": 0.9513700816079773,
118
+ "eval_cosine_mrr@10": 0.9505351130351131,
119
+ "eval_cosine_ndcg@10": 0.9565510675566292,
120
+ "eval_cosine_precision@1": 0.9367003367003367,
121
+ "eval_cosine_precision@10": 0.09750841750841752,
122
+ "eval_cosine_precision@3": 0.3204264870931538,
123
+ "eval_cosine_precision@5": 0.19380471380471379,
124
+ "eval_cosine_recall@1": 0.9367003367003367,
125
+ "eval_cosine_recall@10": 0.9750841750841751,
126
+ "eval_cosine_recall@3": 0.9612794612794613,
127
+ "eval_cosine_recall@5": 0.969023569023569,
128
+ "eval_loss": 0.04832224175333977,
129
+ "eval_runtime": 49.2695,
130
+ "eval_samples_per_second": 142.076,
131
+ "eval_sequential_score": 0.9513700816079773,
132
+ "eval_steps_per_second": 8.89,
133
+ "step": 400
134
+ },
135
+ {
136
+ "epoch": 0.09235315847801995,
137
+ "grad_norm": 0.9424126744270325,
138
+ "learning_rate": 1.5060539027168317e-05,
139
+ "loss": 0.0524,
140
+ "step": 500
141
+ },
142
+ {
143
+ "epoch": 0.09235315847801995,
144
+ "eval_cosine_accuracy@1": 0.9387205387205387,
145
+ "eval_cosine_accuracy@10": 0.9787878787878788,
146
+ "eval_cosine_accuracy@3": 0.9622895622895623,
147
+ "eval_cosine_accuracy@5": 0.9703703703703703,
148
+ "eval_cosine_map@100": 0.9530933506069861,
149
+ "eval_cosine_mrr@10": 0.9525124258457593,
150
+ "eval_cosine_ndcg@10": 0.9588799906525647,
151
+ "eval_cosine_precision@1": 0.9387205387205387,
152
+ "eval_cosine_precision@10": 0.09787878787878787,
153
+ "eval_cosine_precision@3": 0.3207631874298541,
154
+ "eval_cosine_precision@5": 0.19407407407407404,
155
+ "eval_cosine_recall@1": 0.9387205387205387,
156
+ "eval_cosine_recall@10": 0.9787878787878788,
157
+ "eval_cosine_recall@3": 0.9622895622895623,
158
+ "eval_cosine_recall@5": 0.9703703703703703,
159
+ "eval_loss": 0.04548173025250435,
160
+ "eval_runtime": 49.8784,
161
+ "eval_samples_per_second": 140.341,
162
+ "eval_sequential_score": 0.9530933506069861,
163
+ "eval_steps_per_second": 8.781,
164
+ "step": 500
165
+ }
166
+ ],
167
+ "logging_steps": 100,
168
+ "max_steps": 1500,
169
+ "num_input_tokens_seen": 0,
170
+ "num_train_epochs": 1,
171
+ "save_steps": 500,
172
+ "stateful_callbacks": {
173
+ "TrainerControl": {
174
+ "args": {
175
+ "should_epoch_stop": false,
176
+ "should_evaluate": false,
177
+ "should_log": false,
178
+ "should_save": true,
179
+ "should_training_stop": false
180
+ },
181
+ "attributes": {}
182
+ }
183
+ },
184
+ "total_flos": 0.0,
185
+ "train_batch_size": 8,
186
+ "trial_name": null,
187
+ "trial_params": null
188
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc48324821e670334bde14afaceaab851215544086086930c4482698c24359fd
3
+ size 5368
vocab.json ADDED
The diff for this file is too large to render. See raw diff