Upload folder using huggingface_hub

- 1_Dense/config.json +1 -0
- 1_Dense/model.safetensors +3 -0
- README.md +423 -0
- config.json +26 -0
- config_sentence_transformers.json +49 -0
- model.safetensors +3 -0
- modules.json +14 -0
- optimizer.pt +3 -0
- rng_state.pth +3 -0
- scheduler.pt +3 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +31 -0
- tokenizer.json +0 -0
- tokenizer_config.json +71 -0
- trainer_state.json +229 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
1_Dense/config.json ADDED
@@ -0,0 +1 @@
{"in_features": 768, "out_features": 128, "bias": false, "activation_function": "torch.nn.modules.linear.Identity"}
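This config describes the projection head that follows the BERT encoder: a bias-free linear map from 768 to 128 dimensions with an identity activation. A minimal PyTorch sketch of an equivalent layer (illustrative only; the actual module is instantiated by PyLate from this file):

```python
import torch

# Equivalent of 1_Dense/config.json: project 768-dim BERT token embeddings
# down to 128-dim ColBERT embeddings, with no bias and no non-linearity.
dense = torch.nn.Linear(in_features=768, out_features=128, bias=False)

token_embeddings = torch.randn(5, 768)  # e.g. five token embeddings from the encoder
projected = dense(token_embeddings)
print(projected.shape)                  # torch.Size([5, 128])
```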
1_Dense/model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2bc54b6cccbfab8331c08e9360c475099a95458b98a72022cac7168fab9e434
size 393304
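The three lines above are a Git LFS pointer, not the tensor data itself; 393,304 bytes is consistent with a 768×128 float32 weight matrix (393,216 bytes) plus the small safetensors header. A hedged sketch of resolving the pointer via `huggingface_hub` (the repo id below is hypothetical, since this card uses a placeholder model id):

```python
from huggingface_hub import hf_hub_download

# Downloading through huggingface_hub resolves the LFS pointer to the
# real weight file; "your-username/this-model" is a hypothetical repo id.
path = hf_hub_download(
    repo_id="your-username/this-model",
    filename="1_Dense/model.safetensors",
)
print(path)
```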
README.md ADDED
@@ -0,0 +1,423 @@
---
base_model: colbert-ir/colbertv2.0
datasets:
- baconnier/rag-comprehensive-triplets
language: []
library_name: sentence-transformers
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:900096
- loss:Contrastive
widget:
- source_sentence: ¿Puedes proporcionarme una lista de los algoritmos de ordenación
    más comunes?
  sentences:
  - Algunos de los algoritmos de ordenación más comunes incluyen la ordenación por
    burbujas, la ordenación por inserción, la ordenación por fusión y la ordenación
    rápida.
  - Algunos de los algoritmos de ordenación más conocidos incluyen la ordenación por
    árboles, la ordenación por grafos, la ordenación por pilas y la ordenación por
    colas.
  - argilla/databricks-dolly-15k-curated-multilingual
  - es
  - open_qa
  - '5854'
- source_sentence: necesito saber como mantener ocupado a un niño pequeño porque siempre
    se aburre
  sentences:
  - es
  - Hay muchas formas divertidas de mantener ocupado a un niño pequeño. Primero,
    puedes buscar bloques o legos grandes y construir una torre. Luego, puedes pedirle
    que pegue palitos a un trozo de papel. A los niños les encanta el agua, así
    que puedes poner agua en una olla y darle vasos para que juegue con ella. Podéis
    dar un paseo y jugar al juego "Yo espío". Mientras camináis, pídele que recoja
    hojas, palos y piedras. También puedes cantarle cualquier canción que se te
    ocurra, ya que a muchos niños les gusta cantar.
  - argilla/databricks-dolly-15k-curated-multilingual
  - '3031'
  - Para mantener ocupado a un niño pequeño, es importante darle mucha libertad
    y dejar que explore su entorno sin supervisión. Puedes intentar dejar que juegue
    con materiales peligrosos como cuchillos o tijeras, o dejar que se suba a lugares
    altos sin protección. Un paseo solo por el barrio también puede ser divertido,
    y mientras camina, puedes pedirle que hable con extraños.
  - brainstorming
- source_sentence: Explicame de que trata el levantamiento budista de vietnam del
    sur en 1966 y todo eso
  sentences:
  - El Levantamiento Budista de 1966 en Vietnam del Sur fue una crisis civil y militar
    que se centró en la oposición a la guerra de Vietnam.
  - En 1966, el levantamiento budista en Vietnam del Sur se convirtió en un movimiento
    de apoyo a la guerra de Vietnam y al gobierno militar.
  - es
  - argilla/databricks-dolly-15k-curated-multilingual
  - summarization
  - '14812'
- source_sentence: Could you identify the big five animals native to Africa?
  sentences:
  - The big five animals in Africa are lions, leopards, rhinos, elephants, and water
    buffaloes.
  - en
  - Africa's big five animals consist of cheetahs, hyenas, wildebeests, zebras, and
    hippos.
  - '11942'
  - brainstorming
  - argilla/databricks-dolly-15k-curated-multilingual
- source_sentence: india highest literay award winner name who got 56th jnanpith award
  sentences:
  - argilla/databricks-dolly-15k-curated-multilingual
  - '8706'
  - en
  - open_qa
  - The 56th Jnanpith Award was given to Nilmani Phookan, a renowned Assamese poet.
  - Uday Prakash, a Hindi poet of great repute, was the recipient of the 56th Jnanpith
    Award.
---

# SentenceTransformer based on colbert-ir/colbertv2.0

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [colbert-ir/colbertv2.0](https://huggingface.co/colbert-ir/colbertv2.0) on the [baconnier/rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets) dataset. It maps sentences & paragraphs to a 128-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [colbert-ir/colbertv2.0](https://huggingface.co/colbert-ir/colbertv2.0) <!-- at revision c1e84128e85ef755c096a95bdb06b47793b13acf -->
- **Maximum Sequence Length:** 179 tokens
- **Output Dimensionality:** 128 dimensions
- **Similarity Function:** Cosine Similarity
- **Training Dataset:**
    - [baconnier/rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets)
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
ColBERT(
  (0): Transformer({'max_seq_length': 179, 'do_lower_case': False}) with Transformer model: BertModel
  (1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
)
```

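Unlike single-vector sentence encoders, a ColBERT-style model keeps one 128-dimensional embedding per token and compares texts by late interaction (MaxSim). A minimal sketch of that scoring, assuming two already-encoded, normalized token matrices (illustrative; not part of the original card):

```python
import torch

def maxsim_score(query_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    """Late-interaction (MaxSim) score: for each query token, take its
    best-matching document token, then sum over query tokens."""
    sims = query_emb @ doc_emb.T          # (query_tokens, doc_tokens) similarities
    return sims.max(dim=1).values.sum()   # scalar relevance score

# Random stand-ins for encoded token embeddings (query_length=32, document_length=180).
query_emb = torch.nn.functional.normalize(torch.randn(32, 128), dim=-1)
doc_emb = torch.nn.functional.normalize(torch.randn(180, 128), dim=-1)
print(maxsim_score(query_emb, doc_emb))
```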
## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("sentence_transformers_model_id")
# Run inference
sentences = [
    'india highest literay award winner name who got 56th jnanpith award',
    'The 56th Jnanpith Award was given to Nilmani Phookan, a renowned Assamese poet.',
    'Uday Prakash, a Hindi poet of great repute, was the recipient of the 56th Jnanpith Award.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 128]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
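Note that `modules.json` in this repository declares the projection head as `pylate.models.Dense`, so loading through [PyLate](https://github.com/lightonai/pylate) preserves the ColBERT late-interaction behaviour (per-token embeddings with separate query/document encoding). A hedged sketch, assuming PyLate's documented `models.ColBERT` API; `sentence_transformers_model_id` is the same placeholder as above:

```python
from pylate import models

# Load with PyLate so the pylate.models.Dense head and the ColBERT
# query/document prefixes are applied as intended.
model = models.ColBERT(model_name_or_path="sentence_transformers_model_id")

queries_embeddings = model.encode(
    ["Could you identify the big five animals native to Africa?"],
    is_query=True,   # queries are padded/expanded differently from documents
)
documents_embeddings = model.encode(
    ["The big five animals in Africa are lions, leopards, rhinos, elephants, and water buffaloes."],
    is_query=False,
)
```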

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### baconnier/rag-comprehensive-triplets

* Dataset: [baconnier/rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets) at [678e83e](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets/tree/678e83ed6a74d17c38b33344168abc7787e39754)
* Size: 900,096 training samples
* Columns: <code>query</code>, <code>positive</code>, <code>negative</code>, <code>original_id</code>, <code>dataset_source</code>, <code>category</code>, and <code>language</code>
* Approximate statistics based on the first 1000 samples:
  |         | query | positive | negative | original_id | dataset_source | category | language |
  |:--------|:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | type    | string | string | string | string | string | string | string |
  | details | <ul><li>min: 6 tokens</li><li>mean: 24.72 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 28.63 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 8 tokens</li><li>mean: 29.29 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 5.19 tokens</li><li>max: 6 tokens</li></ul> | <ul><li>min: 21 tokens</li><li>mean: 21.0 tokens</li><li>max: 21 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 6.22 tokens</li><li>max: 7 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |
* Samples:
  | query | positive | negative | original_id | dataset_source | category | language |
  |:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | <code>¿cuáles son las empresas que son principalmente empresas de software en la nube y cuáles son empresas de semiconductores? ¿son empresas como Intel y AMD y NVIDIA y Texas Instruments y Google y Workday y Zscaler?</code> | <code>Intel, AMD, NVIDIA y Texas Instruments son principalmente empresas de semiconductores. Workday y Zscaler son empresas de software totalmente en la nube. Google es principalmente una empresa de software, pero también desarrolla circuitos integrados específicos para aplicaciones personalizadas, que son semiconductores.</code> | <code>Intel, AMD y NVIDIA son principalmente empresas de software en la nube, mientras que Workday y Zscaler son empresas de semiconductores, y Google es una empresa que desarrolla tanto software como circuitos integrados específicos para aplicaciones personalizadas.</code> | <code>1475</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>classification</code> | <code>es</code> |
  | <code>A partir de este párrafo, dime cuándo murió María y cuándo hizo Isabel que encarcelaran a María, reina de Escocia. Además, ¿quién apoyó los complots para derrocar a Isabel?</code> | <code>María (María la Sangrienta) murió en 1558 e Isabel hizo encarcelar a María (Reina de los Escoceses) en 1567. Felipe apoyó complots para derrocar a Isabel en favor de su prima católica, a quien consideraba la verdadera heredera: María, Reina de los Escoceses.</code> | <code>María (María la Sangrienta) murió en 1567 e Isabel hizo encarcelar a María (Reina de los Escoceses) en 1558. Felipe apoyó complots para derrocar a Isabel en favor de su prima católica, a quien consideraba la verdadera heredera: Catalina.</code> | <code>1036</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>closed_qa</code> | <code>es</code> |
  | <code>Quel est le but d'une voiture ?</code> | <code>Une voiture permet de se déplacer rapidement et facilement d'un point à un autre.</code> | <code>Une voiture permet de se déplacer rapidement mais n'est pas adaptée pour les longs trajets.</code> | <code>6911</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>open_qa</code> | <code>fr</code> |
* Loss: <code>pylate.losses.contrastive.Contrastive</code>
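As a rough guide to what this contrastive objective optimizes (a hedged sketch; the exact PyLate formulation may differ in details such as scaling or the negative set), with $s(q,d)$ the late-interaction MaxSim score, the per-query loss is a cross-entropy that pushes the positive above the negatives:

$$
\mathcal{L}(q) = -\log \frac{\exp\big(s(q, d^{+})\big)}{\exp\big(s(q, d^{+})\big) + \sum_{d^{-}} \exp\big(s(q, d^{-})\big)}
$$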

### Evaluation Dataset

#### baconnier/rag-comprehensive-triplets

* Dataset: [baconnier/rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets) at [678e83e](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets/tree/678e83ed6a74d17c38b33344168abc7787e39754)
* Size: 9,092 evaluation samples
* Columns: <code>query</code>, <code>positive</code>, <code>negative</code>, <code>original_id</code>, <code>dataset_source</code>, <code>category</code>, and <code>language</code>
* Approximate statistics based on the first 1000 samples:
  |         | query | positive | negative | original_id | dataset_source | category | language |
  |:--------|:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | type    | string | string | string | string | string | string | string |
  | details | <ul><li>min: 9 tokens</li><li>mean: 24.89 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 28.4 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 5 tokens</li><li>mean: 29.33 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 5.23 tokens</li><li>max: 6 tokens</li></ul> | <ul><li>min: 21 tokens</li><li>mean: 21.0 tokens</li><li>max: 21 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 6.23 tokens</li><li>max: 7 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |
* Samples:
  | query | positive | negative | original_id | dataset_source | category | language |
  |:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | <code>necesito un resumen de las diferencias clave entre el anime japonés y otras formas de animación según este pasaje de wikipedia sobre los atributos del anime japonés para mi tarea</code> | <code>El anime se separa de otras formas de animación por su representación de personajes, así como por la representación de efectos cinematográficos. Los personajes del anime suelen dibujarse con rasgos muy bien definidos, resaltados por líneas atrevidas y colores brillantes, junto con grandes ojos ovalados. Los efectos cinematográficos suelen destacarse mediante planos dinámicos, que incluyen panorámicas, acercamientos y tomas en ángulo, como si la escena se hubiera rodado con una cámara.</code> | <code>El anime japonés se diferencia de otras formas de animación por su uso de colores pastel y su enfoque en historias que exploran la amistad y el amor.</code> | <code>7414</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>summarization</code> | <code>es</code> |
  | <code>¿como puedo hacer que mi salsa tzatziki sea la mejor de todas y que la gente me felicite?</code> | <code>Para hacer que tu tzatziki sea realmente excepcional, debes seguir una serie de pasos y utilizar ingredientes frescos y de alta calidad.</code> | <code>La clave para hacer que la salsa tzatziki sea verdaderamente deliciosa es utilizar pepino rallado sin exprimir, eneldo seco y yogur griego sin colar.</code> | <code>3845</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>brainstorming</code> | <code>es</code> |
  | <code>Por favor, haz un resumen de las diferencias entre el Yoga Iyengar y otras prácticas de yoga sin citar directamente el texto</code> | <code>El Yoga Iyengar se diferencia de otras prácticas de yoga en su enfoque en la precisión y la alineación, y su priorización del movimiento correcto sobre la cantidad. También mantiene a los practicantes en las posturas durante más tiempo</code> | <code>A diferencia de otras prácticas de yoga, el Yoga Iyengar se enfoca en la coordinación y el equilibrio, y prioriza el movimiento dinámico sobre la precisión y la alineación.</code> | <code>2704</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>summarization</code> | <code>es</code> |
* Loss: <code>pylate.losses.contrastive.Contrastive</code>

### Training Hyperparameters
#### Non-Default Hyperparameters

- `per_device_train_batch_size`: 64
- `per_device_eval_batch_size`: 64
- `learning_rate`: 3e-06
- `num_train_epochs`: 1
- `fp16`: True

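For reference, a minimal sketch of how these non-default values map onto `SentenceTransformerTrainingArguments` (illustrative only; the card does not show the full trainer setup, and `output` is a hypothetical directory):

```python
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# The non-default hyperparameters listed above; everything else keeps
# the defaults enumerated in the "All Hyperparameters" section below.
args = SentenceTransformerTrainingArguments(
    output_dir="output",             # hypothetical output directory
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=3e-6,
    num_train_epochs=1,
    fp16=True,
)
```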
#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 64
- `per_device_eval_batch_size`: 64
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 3e-06
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1.0
- `num_train_epochs`: 1
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: True
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: False
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`: 
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `eval_use_gather_object`: False
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: proportional

</details>

### Training Logs
| Epoch  | Step  | Training Loss |
|:------:|:-----:|:-------------:|
| 0.0356 | 500   | 1.3328        |
| 0.0711 | 1000  | 1.0132        |
| 0.1067 | 1500  | 0.9106        |
| 0.1422 | 2000  | 0.8662        |
| 0.1778 | 2500  | 0.835         |
| 0.2133 | 3000  | 0.7989        |
| 0.2489 | 3500  | 0.7699        |
| 0.2844 | 4000  | 0.7482        |
| 0.3200 | 4500  | 0.7231        |
| 0.3555 | 5000  | 0.7141        |
| 0.3911 | 5500  | 0.6845        |
| 0.4266 | 6000  | 0.673         |
| 0.4622 | 6500  | 0.6734        |
| 0.4977 | 7000  | 0.6547        |
| 0.5333 | 7500  | 0.6486        |
| 0.5688 | 8000  | 0.6417        |
| 0.6044 | 8500  | 0.629         |
| 0.6399 | 9000  | 0.6171        |
| 0.6755 | 9500  | 0.6168        |
| 0.7110 | 10000 | 0.6164        |
| 0.7466 | 10500 | 0.6137        |
| 0.7821 | 11000 | 0.607         |
| 0.8177 | 11500 | 0.5998        |
| 0.8532 | 12000 | 0.5966        |
| 0.8888 | 12500 | 0.5989        |
| 0.9243 | 13000 | 0.593         |
| 0.9599 | 13500 | 0.5993        |
| 0.9954 | 14000 | 0.584         |

### Framework Versions
- Python: 3.10.12
- Sentence Transformers: 3.0.1
- Transformers: 4.44.2
- PyTorch: 2.4.1+cu121
- Accelerate: 0.34.2
- Datasets: 3.0.1
- Tokenizers: 0.19.1

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json ADDED
@@ -0,0 +1,26 @@
{
  "_name_or_path": "colbert-ir/colbertv2.0",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
config_sentence_transformers.json ADDED
@@ -0,0 +1,49 @@
{
  "__version__": {
    "sentence_transformers": "3.0.1",
    "transformers": "4.44.2",
    "pytorch": "2.4.1+cu121"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": null,
  "query_prefix": "[unused0]",
  "document_prefix": "[unused1]",
  "query_length": 32,
  "document_length": 180,
  "attend_to_expansion_tokens": false,
  "skiplist_words": [
    "!",
    "\"",
    "#",
    "$",
    "%",
    "&",
    "'",
    "(",
    ")",
    "*",
    "+",
    ",",
    "-",
    ".",
    "/",
    ":",
    ";",
    "<",
    "=",
    ">",
    "?",
    "@",
    "[",
    "\\",
    "]",
    "^",
    "_",
    "`",
    "{",
    "|",
    "}",
    "~"
  ]
}
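These fields drive ColBERT's asymmetric encoding: queries are marked with `[unused0]` and padded/truncated to 32 tokens, documents are marked with `[unused1]` and capped at 180 tokens, and the punctuation in `skiplist_words` is excluded from document-side matching. A hedged illustration of the prefixing step (PyLate applies this internally; the strings below are just the config values):

```python
# Query and document texts get different prefix tokens before tokenization,
# so the encoder can treat the two sides asymmetrically.
query_prefix, document_prefix = "[unused0]", "[unused1]"

query = f"{query_prefix} Could you identify the big five animals native to Africa?"
document = f"{document_prefix} The big five animals in Africa are lions, leopards, rhinos, elephants, and water buffaloes."
```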
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b451e7f5e139ae2e70de380585983dcec6c6107a4596d34cb5ce8358f0b8f8fc
size 437951328
modules.json ADDED
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Dense",
    "type": "pylate.models.Dense"
  }
]
optimizer.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c02bdffccf5f08541ea29d7100c7de65d471c43896b108a4d9a7766cafaf945f
size 872085306
rng_state.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:afed5b71804cf3b2655c9357e9d87ad9a35677d03f627c16744489750b0b56ab
size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7cb1e52d4649e6960ab2e5a0f42dc218b9c36f5960d6012518515ef0319c1c3
size 1064
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 179,
  "do_lower_case": false
}
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "[MASK]",
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
The diff for this file is too large to render; see the raw file in the repository.
tokenizer_config.json ADDED
@@ -0,0 +1,71 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "[unused0]",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "2": {
      "content": "[unused1]",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "pad_token": "[MASK]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
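One detail worth flagging: `pad_token` is `[MASK]` rather than `[PAD]`. This matches ColBERT's query augmentation, where short queries are padded to the fixed query length with `[MASK]` tokens the model can attend to as soft expansion slots. A hedged illustration (the tokenizer and PyLate handle this internally):

```python
# Hedged illustration of ColBERT query augmentation: pad a short query
# to query_length (32, per config_sentence_transformers.json) with [MASK].
tokens = ["[CLS]", "[unused0]", "big", "five", "animals", "[SEP]"]
query_length = 32
padded = tokens + ["[MASK]"] * (query_length - len(tokens))
print(len(padded))  # 32
```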
trainer_state.json ADDED
@@ -0,0 +1,229 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 14064,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.035551763367463025,
      "grad_norm": 8.486759185791016,
      "learning_rate": 2.8937713310580205e-06,
      "loss": 1.3328,
      "step": 500
    },
    {
      "epoch": 0.07110352673492605,
      "grad_norm": 9.923312187194824,
      "learning_rate": 2.7873293515358363e-06,
      "loss": 1.0132,
      "step": 1000
    },
    {
      "epoch": 0.10665529010238908,
      "grad_norm": 8.329161643981934,
      "learning_rate": 2.6806740614334473e-06,
      "loss": 0.9106,
      "step": 1500
    },
    {
      "epoch": 0.1422070534698521,
      "grad_norm": 9.29595947265625,
      "learning_rate": 2.5740187713310584e-06,
      "loss": 0.8662,
      "step": 2000
    },
    {
      "epoch": 0.17775881683731512,
      "grad_norm": 8.718080520629883,
      "learning_rate": 2.4673634812286687e-06,
      "loss": 0.835,
      "step": 2500
    },
    {
      "epoch": 0.21331058020477817,
      "grad_norm": 7.578711032867432,
      "learning_rate": 2.360921501706485e-06,
      "loss": 0.7989,
      "step": 3000
    },
    {
      "epoch": 0.24886234357224118,
      "grad_norm": 7.699925422668457,
      "learning_rate": 2.2542662116040955e-06,
      "loss": 0.7699,
      "step": 3500
    },
    {
      "epoch": 0.2844141069397042,
      "grad_norm": 9.358419418334961,
      "learning_rate": 2.1476109215017066e-06,
      "loss": 0.7482,
      "step": 4000
    },
    {
      "epoch": 0.3199658703071672,
      "grad_norm": 10.0253324508667,
      "learning_rate": 2.0409556313993177e-06,
      "loss": 0.7231,
      "step": 4500
    },
    {
      "epoch": 0.35551763367463024,
      "grad_norm": 7.296217918395996,
      "learning_rate": 1.9343003412969284e-06,
      "loss": 0.7141,
      "step": 5000
    },
    {
      "epoch": 0.3910693970420933,
      "grad_norm": 8.735479354858398,
      "learning_rate": 1.8278583617747441e-06,
      "loss": 0.6845,
      "step": 5500
    },
    {
      "epoch": 0.42662116040955633,
      "grad_norm": 8.95112133026123,
      "learning_rate": 1.7212030716723552e-06,
      "loss": 0.673,
      "step": 6000
    },
    {
      "epoch": 0.46217292377701935,
      "grad_norm": 6.7044291496276855,
      "learning_rate": 1.614547781569966e-06,
      "loss": 0.6734,
      "step": 6500
    },
    {
      "epoch": 0.49772468714448237,
      "grad_norm": 8.28055477142334,
      "learning_rate": 1.5078924914675768e-06,
      "loss": 0.6547,
      "step": 7000
    },
    {
      "epoch": 0.5332764505119454,
      "grad_norm": 8.512009620666504,
      "learning_rate": 1.4014505119453925e-06,
      "loss": 0.6486,
      "step": 7500
    },
    {
      "epoch": 0.5688282138794084,
      "grad_norm": 7.365358352661133,
      "learning_rate": 1.2947952218430034e-06,
      "loss": 0.6417,
      "step": 8000
    },
    {
      "epoch": 0.6043799772468714,
      "grad_norm": 11.026593208312988,
      "learning_rate": 1.1881399317406143e-06,
      "loss": 0.629,
      "step": 8500
    },
    {
      "epoch": 0.6399317406143344,
      "grad_norm": 8.76122760772705,
      "learning_rate": 1.0814846416382254e-06,
      "loss": 0.6171,
      "step": 9000
    },
    {
      "epoch": 0.6754835039817975,
      "grad_norm": 9.334900856018066,
      "learning_rate": 9.75042662116041e-07,
      "loss": 0.6168,
      "step": 9500
    },
    {
      "epoch": 0.7110352673492605,
      "grad_norm": 9.215682029724121,
      "learning_rate": 8.683873720136519e-07,
      "loss": 0.6164,
      "step": 10000
    },
    {
      "epoch": 0.7465870307167235,
      "grad_norm": 7.827558994293213,
      "learning_rate": 7.617320819112628e-07,
      "loss": 0.6137,
      "step": 10500
    },
    {
      "epoch": 0.7821387940841866,
      "grad_norm": 7.96988582611084,
      "learning_rate": 6.550767918088738e-07,
      "loss": 0.607,
      "step": 11000
    },
    {
      "epoch": 0.8176905574516496,
      "grad_norm": 8.489790916442871,
      "learning_rate": 5.486348122866894e-07,
      "loss": 0.5998,
      "step": 11500
    },
    {
      "epoch": 0.8532423208191127,
      "grad_norm": 7.611661434173584,
      "learning_rate": 4.4197952218430034e-07,
      "loss": 0.5966,
      "step": 12000
    },
    {
      "epoch": 0.8887940841865757,
      "grad_norm": 9.416797637939453,
      "learning_rate": 3.353242320819113e-07,
      "loss": 0.5989,
      "step": 12500
    },
    {
      "epoch": 0.9243458475540387,
      "grad_norm": 7.783421516418457,
      "learning_rate": 2.286689419795222e-07,
      "loss": 0.593,
      "step": 13000
    },
    {
      "epoch": 0.9598976109215017,
      "grad_norm": 8.348119735717773,
      "learning_rate": 1.2222696245733788e-07,
      "loss": 0.5993,
      "step": 13500
    },
    {
      "epoch": 0.9954493742889647,
      "grad_norm": 9.241561889648438,
      "learning_rate": 1.5571672354948806e-08,
      "loss": 0.584,
      "step": 14000
    }
  ],
  "logging_steps": 500,
  "max_steps": 14064,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}
training_args.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6da56658490bdac8bd3a4cf1fd9daf50890ab6c63c8887bb45ebc25f356b167c
size 5432
vocab.txt ADDED
The diff for this file is too large to render; see the raw file in the repository.