Upload folder using huggingface_hub
- .gitattributes +1 -0
- 1_Pooling/config.json +10 -0
- README.md +432 -0
- config.json +37 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +20 -0
- sentence_bert_config.json +4 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +63 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
{
  "word_embedding_dimension": 1024,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}
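This pooling configuration enables mean pooling only: the sentence embedding is the average of the transformer's token embeddings over non-padding positions. A minimal sketch of the computation this config selects (the helper name and tensor shapes are illustrative, not taken from the repo):

```python
import torch

def mean_pooling(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, 1024), the transformer output
    # attention_mask:   (batch, seq_len), 1 for real tokens, 0 for padding
    mask = attention_mask.unsqueeze(-1).float()       # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)     # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)          # token counts, guarded against zero
    return summed / counts                            # (batch, 1024) sentence embeddings
```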
README.md
ADDED
@@ -0,0 +1,432 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:25743
- loss:MultipleNegativesRankingLoss
base_model: Lajavaness/bilingual-embedding-large
widget:
- source_sentence: 'Greta Thunberg leads People With Money''s annual list of the "100 highest-paid activists" released on Monday. It''s been a rough year for the activist, but at least she has her millions of dollars to ease the pain. Greta Thunberg, 16, has been ranked No. 1 on People With Money''s 10 highest-paid activists for 2019 with an estimated $46 million in combined earnings. UPDATE 09/24/2019: This story appears to be false. (Read more) Greta Thunberg tops annual list of highest-paid activists In 2017 it seemed that the activist''s spectacular career was coming to an end. Suddenly she was back on top. People With Money reports on Monday (September 23) that Thunberg is the highest-paid activist in the world, with an astonishing $46 million between August 2018 and August 2019, a nearly $20 million lead over her closest competition. . Factors of people with money In compiling this annual list, the magazine considers factors such as down payment, profit sharing, residuals, endorsements and advertising work. The Swedish activist has an estimated net worth of $145 million. She owes her fortune to savvy stock investments, substantial property holdings, lucrative endorsement deals with CoverGirl cosmetics. He also owns several restaurants (the "Fat Thunberg Burger" chain) in Stockholm, a football team (the "Stockholm Angels"), has launched his own brand of vodka (Pure Wonderthunberg - Sweden), and is tackling the youth market with best-selling perfume (With Love from Greta) and a fashion line called "Greta Thunberg Seduction." The ranking is significant for many Greta fans, who have been waiting for her triumphant return to the glory days for what seems like a lifetime.'
  sentences:
  - Donald Trump next to a stack of declassified files
  - Greta Thunberg leads People With Money's annual list of the "100 highest-paid activists"
  - Women should not to use shampoo during their periods “because the pores of the head are open during menstruation and it can cause headache”.
- source_sentence: Dr. Bonnie Henry, British Columbia's Provincial Health Officer made a rare Saturday appearance in the media to issue a warning to people who plan on using old pieces of gypsum boards to make their own glory holes, saying that there is a risk of exposure to asbestos and recommends airing on the side of caution and using newer sheets of drywall only. "We realize that many British Columbians have been waiting for a long time for the opportunity to get intimate and a few of those who will choose to use glory holes need to be aware that older gypsum sheet that may be sitting in the basement or backyard will often contain toxic levels of asbestos." Henry told reporters. "The safest method yet to ensure zero risk of exposure to Covid 19 is getting off alone, using your own imagination, toys and electronic devices." The health officer added. "I know this is a topic that makes some people uncomfortable but I need to add that I've come across a few hospital reports of young men getting injured getting their genitals stuck in a small hole they made in a wall with a kitchen knife." Dr, Henry said with a softer tone. "Just to be clear, glory holes are small openings in a wall where a person will offer their intimate parts to a partner who is on the opposite side who will be pleasuring the exposed parts, ideally, until full satisfaction. This is all done in the spirit of doing your share to flatten the curve while having a bit of fun this summer."
  sentences:
  - Petai can treat all types of cancer and muscle pain
  - Terrible scene of dead is prepared to generate fear by covid-19 in the population
  - BC Minister of Health warned against asbestos poisoning when using glory holes
- source_sentence: THIS YOUNG CHADIAN WHO HAS A GUN POINTED ON THE CHEST BY A FRENCH SOLDIER IN HIS COUNTRY BECAUSE HE DARED TO DEMONSTRATE AGAINST THE DESPOTIC REGIME IN PAYMENT TO FRANCE IS THE NEW SYMBOL OF RESISTANCE TO FRENCH NEO-COLONIALISM. Qatar Foundati
  sentences:
  - This young Chadian who has a gun pointed at his chest by a French soldier in his country because he dared to demonstrate against the despotic regime in the pay of France
  - CNN uses footage of an explosion in Ukraine from 2015 to illustrate the current conflict.
  - Woman gives birth despite having no womb
- source_sentence: The Prime Minister of Israel is infected with the Corona virus, pray for him, may God have mercy on us from him
  sentences:
  - Photo of Netanyahu in hospital with coronavirus
  - Documents about Covid-19 written by Dr. Thanin Kongsuk?
  - Police warn burglars are going "door to door" with contaminated face masks in a new scam
- source_sentence: Jose Mujica Rafael Correa The United Nations appoints them like the best presidents in the world AND THEY ARE LEFT
  sentences:
  - José Mujica and Rafael Correa were chosen by the United Nations as the best presidents in the world
  - Vaccination made the number of deaths from Covid-19 this year surpass that of 2020
  - China defeated the new coronavirus
pipeline_tag: sentence-similarity
library_name: sentence-transformers
---

# SentenceTransformer based on Lajavaness/bilingual-embedding-large

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Lajavaness/bilingual-embedding-large](https://huggingface.co/Lajavaness/bilingual-embedding-large). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [Lajavaness/bilingual-embedding-large](https://huggingface.co/Lajavaness/bilingual-embedding-large) <!-- at revision e83179d7a66e8aed1b3015e98bb5ae234ed89598 -->
- **Maximum Sequence Length:** 512 tokens
- **Output Dimensionality:** 1024 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BilingualModel
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
```
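The three modules run in sequence: the transformer produces per-token embeddings, the pooling layer averages them (see 1_Pooling/config.json above), and Normalize L2-normalizes the result, which is why cosine similarity and dot product coincide on this model's outputs. A sketch of the equivalent manual pipeline with plain transformers, assuming `trust_remote_code=True` is needed for the custom BilingualModel and using a placeholder repo path:

```python
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

path = "path/to/this/repo"  # placeholder for the uploaded folder or its Hub id
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path, trust_remote_code=True)  # custom BilingualModel

batch = tokenizer(["China defeated the new coronavirus"],
                  padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    token_embeddings = model(**batch).last_hidden_state             # (1, seq_len, 1024)

mask = batch["attention_mask"].unsqueeze(-1).float()
pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)     # module (1): mean pooling
embedding = F.normalize(pooled, p=2, dim=1)                         # module (2): Normalize
```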
## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("sentence_transformers_model_id")
# Run inference
sentences = [
    'Jose Mujica Rafael Correa The United Nations appoints them like the best presidents in the world AND THEY ARE LEFT',
    'José Mujica and Rafael Correa were chosen by the United Nations as the best presidents in the world',
    'Vaccination made the number of deaths from Covid-19 this year surpass that of 2020',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 1024]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 25,743 training samples
* Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>label</code>
* Approximate statistics based on the first 1000 samples:
  |         | sentence_0                                                                           | sentence_1                                                                          | label                                                          |
  |:--------|:-------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|:----------------------------------------------------------------|
  | type    | string                                                                               | string                                                                              | float                                                          |
  | details | <ul><li>min: 2 tokens</li><li>mean: 115.14 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 19.63 tokens</li><li>max: 137 tokens</li></ul> | <ul><li>min: 1.0</li><li>mean: 1.0</li><li>max: 1.0</li></ul> |
* Samples:
  | sentence_0 | sentence_1 | label |
  |:-----------|:-----------|:------|
  | <code>Even though India is locked down today, but all the cities of China are open and from April 8, China has announced to open Wuhan as well, not a single leader in China, a single military leader, even a big man. Corona is not done Corona virus has ruined the economy all over the world, thousands of lives have been lost, lakhs have got this disease and countless people have been locked in homes, lock down has happened in many countries in which India is also one. Is The corona virus originated from the city of Wuhan in China, and now it has reached every corner of the world, but the virus did not reach the Chinese capital Beijing and the economic capital Shanghai near Wuhan. Today Paris is closed, New York is closed, Berlin is closed, Rome is closed, Delhi is closed, Mumbai is closed, Tokyo is closed, major economic and political centers of the world are closed but Beijing and Shanghai are open, there Corona has Showed no effect, only a few cases were reported, but in a way, there was no ...</code> | <code>Shanghai and Beijing, China's major cities, are untouched by COVID-19</code> | <code>1.0</code> |
  | <code>This virus is kind of weak.... it hasn't killed any politicians either here or in China!</code> | <code>"How strange, a virus so deadly that it didn't kill any politician... Is the "virus" corrupt?"</code> | <code>1.0</code> |
  | <code>Korean environmental group dies 443 Korea is mixed. This is a mountain in a certain area of Korea. All the trees were cut down and the whole mountain was filled with solar panels. It covered me, but disaster will soon come and I will be heartbroken All. This disaster is no genius. Talent.</code> | <code>This is a mountain in South Korea covered with solar panels</code> | <code>1.0</code> |
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
  ```json
  {
      "scale": 20.0,
      "similarity_fct": "cos_sim"
  }
  ```
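MultipleNegativesRankingLoss treats every other positive in a batch as a negative for a given anchor: cosine similarities between each anchor and all positives are multiplied by the scale (20 here) and passed to a cross-entropy loss whose target is the matching pair on the diagonal. A sketch of the objective as configured above, not the library's exact implementation:

```python
import torch
import torch.nn.functional as F

def mnrl(anchors: torch.Tensor, positives: torch.Tensor, scale: float = 20.0) -> torch.Tensor:
    # anchors, positives: (batch, dim); positives[i] matches anchors[i],
    # every other positive in the batch serves as an in-batch negative.
    scores = F.cosine_similarity(anchors.unsqueeze(1), positives.unsqueeze(0), dim=-1) * scale
    labels = torch.arange(scores.size(0), device=scores.device)  # correct pairs on the diagonal
    return F.cross_entropy(scores, labels)
```

With `per_device_train_batch_size: 2` (see the hyperparameters below), each anchor sees only a single in-batch negative per step.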
### Training Hyperparameters
#### Non-Default Hyperparameters

- `per_device_train_batch_size`: 2
- `per_device_eval_batch_size`: 2
- `num_train_epochs`: 1
- `fp16`: True
- `multi_dataset_batch_sampler`: round_robin

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 2
- `per_device_eval_batch_size`: 2
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 5e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1
- `num_train_epochs`: 1
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: True
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`:
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: round_robin

</details>
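A minimal sketch of reproducing the non-default settings above with the Sentence Transformers 3.x trainer API; the dataset rows and output path are placeholders, while the actual run used the 25,743-pair dataset described earlier:

```python
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss

model = SentenceTransformer("Lajavaness/bilingual-embedding-large", trust_remote_code=True)

# Illustrative (anchor, positive) pairs; MultipleNegativesRankingLoss needs no labels.
train_dataset = Dataset.from_dict({
    "sentence_0": ["full claim or article text ..."],
    "sentence_1": ["short matched claim ..."],
})

args = SentenceTransformerTrainingArguments(
    output_dir="output",            # placeholder
    num_train_epochs=1,
    per_device_train_batch_size=2,
    fp16=True,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=MultipleNegativesRankingLoss(model),
)
trainer.train()
```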
### Training Logs
| Epoch  | Step  | Training Loss |
|:------:|:-----:|:-------------:|
| 0.0388 | 500   | 0.0334        |
| 0.0777 | 1000  | 0.0595        |
| 0.1165 | 1500  | 0.0597        |
| 0.1554 | 2000  | 0.046         |
| 0.1942 | 2500  | 0.0238        |
| 0.2331 | 3000  | 0.0667        |
| 0.2719 | 3500  | 0.0283        |
| 0.3108 | 4000  | 0.0429        |
| 0.3496 | 4500  | 0.0414        |
| 0.3884 | 5000  | 0.0295        |
| 0.4273 | 5500  | 0.0323        |
| 0.4661 | 6000  | 0.0288        |
| 0.5050 | 6500  | 0.0389        |
| 0.5438 | 7000  | 0.0399        |
| 0.5827 | 7500  | 0.0245        |
| 0.6215 | 8000  | 0.0334        |
| 0.6603 | 8500  | 0.0212        |
| 0.6992 | 9000  | 0.0207        |
| 0.7380 | 9500  | 0.0206        |
| 0.7769 | 10000 | 0.0163        |
| 0.8157 | 10500 | 0.0318        |
| 0.8546 | 11000 | 0.0256        |
| 0.8934 | 11500 | 0.0277        |
| 0.9323 | 12000 | 0.027         |
| 0.9711 | 12500 | 0.0179        |


### Framework Versions
- Python: 3.11.11
- Sentence Transformers: 3.4.1
- Transformers: 4.48.3
- PyTorch: 2.5.1+cu124
- Accelerate: 1.3.0
- Datasets: 3.3.1
- Tokenizers: 0.21.0

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### MultipleNegativesRankingLoss
```bibtex
@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json
ADDED
@@ -0,0 +1,37 @@
{
  "_name_or_path": "Lajavaness/bilingual-embedding-large",
  "architectures": [
    "BilingualModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "dangvantuan/bilingual_impl--config.BilingualConfig",
    "AutoModel": "dangvantuan/bilingual_impl--modeling.BilingualModel",
    "AutoModelForMaskedLM": "dangvantuan/bilingual_impl--modeling.BilingualForMaskedLM",
    "AutoModelForMultipleChoice": "dangvantuan/bilingual_impl--modeling.BilingualForMultipleChoice",
    "AutoModelForQuestionAnswering": "dangvantuan/bilingual_impl--modeling.BilingualForQuestionAnswering",
    "AutoModelForSequenceClassification": "dangvantuan/bilingual_impl--modeling.BilingualForSequenceClassification",
    "AutoModelForTokenClassification": "dangvantuan/bilingual_impl--modeling.BilingualForTokenClassification"
  },
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "bilingual",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.48.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
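These dimensions describe an XLM-R-large-shaped encoder; the `auto_map` entries mean plain transformers loads it via remote code from `dangvantuan/bilingual_impl`. As a rough sanity check, the parameter count implied by the config is consistent with the ~2.24 GB float32 checkpoint recorded in model.safetensors below (a back-of-the-envelope estimate that ignores biases, LayerNorms, and the pooler):

```python
# Rough parameter count implied by the config values above.
vocab, hidden, inter, layers, max_pos, types = 250002, 1024, 4096, 24, 514, 1
embeddings = (vocab + max_pos + types) * hidden        # word + position + token-type embeddings
per_layer = 4 * hidden * hidden + 2 * hidden * inter   # attention projections + feed-forward
total = embeddings + layers * per_layer
print(f"~{total / 1e6:.0f}M parameters, ~{total * 4 / 1e9:.2f} GB in float32")
# ~559M parameters, ~2.23 GB in float32
```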
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
{
  "__version__": {
    "sentence_transformers": "3.4.1",
    "transformers": "4.48.3",
    "pytorch": "2.5.1+cu124"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ebb5653a532fa5854e33ef29925a25e19ebefc9d13f2895fc80f9262308491cb
size 2239607176
modules.json
ADDED
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
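modules.json is what lets SentenceTransformer rebuild the three-stage pipeline: each entry names a module class and the subfolder holding its configuration (the Transformer lives at the repo root, hence the empty path). A sketch of assembling the same pipeline by hand with the sentence-transformers models API; passing `trust_remote_code` through `model_args`/`config_args` is an assumption for the custom base model:

```python
from sentence_transformers import SentenceTransformer, models

# Mirrors modules.json: Transformer (root) -> Pooling (1_Pooling) -> Normalize (2_Normalize)
word = models.Transformer(
    "Lajavaness/bilingual-embedding-large",
    max_seq_length=512,
    model_args={"trust_remote_code": True},
    config_args={"trust_remote_code": True},
)
pool = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word, pool, models.Normalize()])
```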
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 512,
  "do_lower_case": false
}
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
size 17082987
tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "max_length": 512,
  "model_max_length": 512,
  "pad_to_multiple_of": null,
  "pad_token": "<pad>",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "</s>",
  "stride": 0,
  "tokenizer_class": "XLMRobertaTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "<unk>"
}
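The tokenizer is the standard XLM-RoBERTa SentencePiece tokenizer (backed by sentencepiece.bpe.model above), capped at 512 tokens and wrapping inputs in `<s>`/`</s>`. A quick illustrative check, using a placeholder repo path:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")  # placeholder path

enc = tokenizer("China defeated the new coronavirus", truncation=True)  # capped at 512 tokens
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
print(tokens[0], tokens[-1])  # <s> </s>, matching special_tokens_map.json
```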