michaelfeil committed on
Commit b725d6e
1 Parent(s): d7e28b9

Upload setu4993/LaBSE ctranslate fp16 weights

.gitattributes CHANGED
@@ -1,34 +1,12 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
+ vocabulary.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,271 @@
+ ---
+ language:
+ - af
+ - am
+ - ar
+ - as
+ - az
+ - be
+ - bg
+ - bn
+ - bo
+ - bs
+ - ca
+ - ceb
+ - co
+ - cs
+ - cy
+ - da
+ - de
+ - el
+ - en
+ - eo
+ - es
+ - et
+ - eu
+ - fa
+ - fi
+ - fr
+ - fy
+ - ga
+ - gd
+ - gl
+ - gu
+ - ha
+ - haw
+ - he
+ - hi
+ - hmn
+ - hr
+ - ht
+ - hu
+ - hy
+ - id
+ - ig
+ - is
+ - it
+ - ja
+ - jv
+ - ka
+ - kk
+ - km
+ - kn
+ - ko
+ - ku
+ - ky
+ - la
+ - lb
+ - lo
+ - lt
+ - lv
+ - mg
+ - mi
+ - mk
+ - ml
+ - mn
+ - mr
+ - ms
+ - mt
+ - my
+ - ne
+ - nl
+ - no
+ - ny
+ - or
+ - pa
+ - pl
+ - pt
+ - ro
+ - ru
+ - rw
+ - si
+ - sk
+ - sl
+ - sm
+ - sn
+ - so
+ - sq
+ - sr
+ - st
+ - su
+ - sv
+ - sw
+ - ta
+ - te
+ - tg
+ - th
+ - tk
+ - tl
+ - tr
+ - tt
+ - ug
+ - uk
+ - ur
+ - uz
+ - vi
+ - wo
+ - xh
+ - yi
+ - yo
+ - zh
+ - zu
+ tags:
+ - ctranslate2
+ - int8
+ - float16
+ - bert
+ - sentence_embedding
+ - multilingual
+ - google
+ - sentence-similarity
+ license: apache-2.0
+ datasets:
+ - CommonCrawl
+ - Wikipedia
+ ---
+ # Fast-Inference with Ctranslate2
+ Speed up inference while reducing memory by 2x-4x using int8 inference in C++ on CPU or GPU.
+
+ This is a quantized version of [setu4993/LaBSE](https://huggingface.co/setu4993/LaBSE).
+ ```bash
+ pip install hf-hub-ctranslate2>=2.10.0 ctranslate2>=3.16.0
+ ```
+
+ ```python
+ # from transformers import AutoTokenizer
+ model_name = "michaelfeil/ct2fast-LaBSE"
+
+ from hf_hub_ctranslate2 import EncoderCT2fromHfHub
+ model = EncoderCT2fromHfHub(
+     # load in float16 on CUDA
+     model_name_or_path=model_name,
+     device="cuda",
+     compute_type="float16",
+     # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}")
+ )
+ embeddings = model.encode(
+     ["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
+     batch_size=32,
+     convert_to_numpy=True,
+     normalize_embeddings=True,
+ )
+ print(embeddings.shape, embeddings)
+ scores = (embeddings @ embeddings.T) * 100
+ ```
+
+ The checkpoint is compatible with [ctranslate2>=3.16.0](https://github.com/OpenNMT/CTranslate2)
+ and [hf-hub-ctranslate2>=2.10.0](https://github.com/michaelfeil/hf-hub-ctranslate2):
+ - `compute_type=int8_float16` for `device="cuda"`
+ - `compute_type=int8` for `device="cpu"` (see the sketch below)
+
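+ For example, a minimal sketch of loading the same checkpoint on CPU with int8 weights, using the compute types listed above and the same `EncoderCT2fromHfHub` API:
+
+ ```python
+ # Sketch: same checkpoint, CPU inference with int8 quantization
+ # (compute_type/device pairing taken from the list above).
+ from hf_hub_ctranslate2 import EncoderCT2fromHfHub
+
+ cpu_model = EncoderCT2fromHfHub(
+     model_name_or_path="michaelfeil/ct2fast-LaBSE",
+     device="cpu",
+     compute_type="int8",
+ )
+ embeddings = cpu_model.encode(
+     ["I like soccer", "I like tennis"],
+     batch_size=32,
+     convert_to_numpy=True,
+     normalize_embeddings=True,
+ )
+ print(embeddings.shape)
+ ```
+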
+ Converted on 2023-06-16 using:
+ ```bash
+ ct2-transformers-converter --model setu4993/LaBSE --output_dir ~/tmp-ct2fast-LaBSE --force --copy_files tokenizer.json README.md tokenizer_config.json vocab.txt special_tokens_map.json .gitattributes --quantization float16 --trust_remote_code
+ ```
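+
+ The same conversion can also be scripted from Python with CTranslate2's converter API; a minimal sketch (the output directory name is illustrative):
+
+ ```python
+ # Sketch: float16 conversion of setu4993/LaBSE via the CTranslate2 Python API,
+ # mirroring the CLI call above. "ct2fast-LaBSE" is an illustrative output path.
+ import ctranslate2
+
+ converter = ctranslate2.converters.TransformersConverter(
+     "setu4993/LaBSE",
+     copy_files=["tokenizer.json", "tokenizer_config.json", "vocab.txt", "special_tokens_map.json"],
+ )
+ converter.convert("ct2fast-LaBSE", quantization="float16", force=True)
+ ```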
+
+ # License and other remarks:
+ This is just a quantized version. License conditions are intended to be identical to those of the original Hugging Face repo.
+
+ # Original description
+
+
+ # LaBSE
+
+ ## Model description
+
+ Language-agnostic BERT Sentence Encoder (LaBSE) is a BERT-based model trained for sentence embedding for 109 languages. The pre-training process combines masked language modeling with translation language modeling. The model is useful for getting multilingual sentence embeddings and for bi-text retrieval.
+
+ - Model: [HuggingFace's model hub](https://huggingface.co/setu4993/LaBSE).
+ - Paper: [arXiv](https://arxiv.org/abs/2007.01852).
+ - Original model: [TensorFlow Hub](https://tfhub.dev/google/LaBSE/2).
+ - Blog post: [Google AI Blog](https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html).
+ - Conversion from TensorFlow to PyTorch: [GitHub](https://github.com/setu4993/convert-labse-tf-pt).
+
+ This model is migrated from the v2 model on TF Hub, which uses dict-based input. The embeddings produced by both versions of the model are [equivalent](https://github.com/setu4993/convert-labse-tf-pt/blob/ec3a019159a54ed6493181a64486c2808c01f216/tests/test_conversion.py#L31).
+
+ ## Usage
+
+ Using the model:
+
+ ```python
+ import torch
+ from transformers import BertModel, BertTokenizerFast
+
+
+ tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
+ model = BertModel.from_pretrained("setu4993/LaBSE")
+ model = model.eval()
+
+ english_sentences = [
+     "dog",
+     "Puppies are nice.",
+     "I enjoy taking long walks along the beach with my dog.",
+ ]
+ english_inputs = tokenizer(english_sentences, return_tensors="pt", padding=True)
+
+ with torch.no_grad():
+     english_outputs = model(**english_inputs)
+ ```
+
+ To get the sentence embeddings, use the pooler output:
+
+ ```python
+ english_embeddings = english_outputs.pooler_output
+ ```
+
+ Output for other languages:
+
+ ```python
+ italian_sentences = [
+     "cane",
+     "I cuccioli sono carini.",
+     "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.",
+ ]
+ japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]
+ italian_inputs = tokenizer(italian_sentences, return_tensors="pt", padding=True)
+ japanese_inputs = tokenizer(japanese_sentences, return_tensors="pt", padding=True)
+
+ with torch.no_grad():
+     italian_outputs = model(**italian_inputs)
+     japanese_outputs = model(**japanese_inputs)
+
+ italian_embeddings = italian_outputs.pooler_output
+ japanese_embeddings = japanese_outputs.pooler_output
+ ```
+
+ For similarity between sentences, an L2-norm is recommended before calculating the similarity:
+
+ ```python
+ import torch.nn.functional as F
+
+
+ def similarity(embeddings_1, embeddings_2):
+     normalized_embeddings_1 = F.normalize(embeddings_1, p=2)
+     normalized_embeddings_2 = F.normalize(embeddings_2, p=2)
+     return torch.matmul(
+         normalized_embeddings_1, normalized_embeddings_2.transpose(0, 1)
+     )
+
+
+ print(similarity(english_embeddings, italian_embeddings))
+ print(similarity(english_embeddings, japanese_embeddings))
+ print(similarity(italian_embeddings, japanese_embeddings))
+ ```
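+
+ As a small illustration of the bi-text retrieval use case mentioned in the model description, here is a sketch that reuses the sentences and the `similarity` helper defined above to match each English sentence to its closest Italian sentence:
+
+ ```python
+ # Bi-text retrieval sketch: for each English sentence, pick the Italian
+ # sentence with the highest L2-normalized dot-product similarity.
+ retrieval_scores = similarity(english_embeddings, italian_embeddings)
+ best = retrieval_scores.argmax(dim=1)
+ for i, j in enumerate(best.tolist()):
+     print(english_sentences[i], "->", italian_sentences[j], retrieval_scores[i, j].item())
+ ```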
+
+ ## Details
+
+ Details about data, training, evaluation and performance metrics are available in the [original paper](https://arxiv.org/abs/2007.01852).
+
+ ### BibTeX entry and citation info
+
+ ```bibtex
+ @misc{feng2020languageagnostic,
+     title={Language-agnostic BERT Sentence Embedding},
+     author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang},
+     year={2020},
+     eprint={2007.01852},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "layer_norm_epsilon": 1e-12,
+   "unk_token": "[UNK]"
+ }
model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6552294a5f206d78186219707f4fa44c0f90c4c213c2d64f622ea173f5f37fee
+ size 941862060
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5aab105881afc3a73d5c8445cdc5c0302b1c3efdecd71a1a34fa0cf4e5b7bf43
+ size 13631023
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
vocabulary.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06915cde8d140096af6bd7b84f8d776cbc6f3898f593a723fadb9d9f517f90c2
+ size 11942449
vocabulary.txt ADDED
The diff for this file is too large to render. See raw diff