Duplicate from flaubert/flaubert_base_uncased
Co-authored-by: Hang Le <flaubert@users.noreply.huggingface.co>
- .gitattributes +9 -0
- README.md +98 -0
- config.json +77 -0
- merges.txt +0 -0
- pytorch_model.bin +3 -0
- tokenizer_config.json +4 -0
- vocab.json +0 -0
.gitattributes
ADDED
@@ -0,0 +1,9 @@
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,98 @@
+---
+language: fr
+license: mit
+datasets:
+- flaubert
+metrics:
+- flue
+tags:
+- bert
+- language-model
+- flaubert
+- flue
+- french
+- flaubert-base
+- uncased
+---
+
+# FlauBERT: Unsupervised Language Model Pre-training for French
+
+**FlauBERT** is a French BERT trained on a very large and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for Scientific Research) [Jean Zay](http://www.idris.fr/eng/jean-zay/) supercomputer.
+
+Along with FlauBERT comes [**FLUE**](https://github.com/getalp/Flaubert/tree/master/flue): an evaluation setup for French NLP systems similar to the popular GLUE benchmark. The goal is to enable further reproducible experiments in the future and to share models and progress on the French language. For more details, please refer to the [official website](https://github.com/getalp/Flaubert).
+
+## FlauBERT models
+
+| Model name | Number of layers | Attention heads | Embedding dimension | Total parameters |
+| :------: | :---: | :---: | :---: | :---: |
+| `flaubert-small-cased` | 6 | 8 | 512 | 54 M |
+| `flaubert-base-uncased` | 12 | 12 | 768 | 137 M |
+| `flaubert-base-cased` | 12 | 12 | 768 | 138 M |
+| `flaubert-large-cased` | 24 | 16 | 1024 | 373 M |
+
+**Note:** `flaubert-small-cased` is only partially trained, so performance is not guaranteed. Consider using it for debugging purposes only.
+
+## Using FlauBERT with Hugging Face's Transformers
+
+```python
+import torch
+from transformers import FlaubertModel, FlaubertTokenizer
+
+# Choose among ['flaubert/flaubert_small_cased', 'flaubert/flaubert_base_uncased',
+#               'flaubert/flaubert_base_cased', 'flaubert/flaubert_large_cased']
+modelname = 'flaubert/flaubert_base_cased'
+
+# Load pretrained model and tokenizer
+flaubert, log = FlaubertModel.from_pretrained(modelname, output_loading_info=True)
+flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=False)
+# do_lowercase=False if using cased models, True if using uncased ones
+
+sentence = "Le chat mange une pomme."
+token_ids = torch.tensor([flaubert_tokenizer.encode(sentence)])
+
+last_layer = flaubert(token_ids)[0]
+print(last_layer.shape)
+# torch.Size([1, 8, 768]) -> (batch size x number of tokens x embedding dimension)
+
+# The BERT [CLS] token corresponds to the first hidden state of the last layer
+cls_embedding = last_layer[:, 0, :]
+```
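+
+A minimal sketch of the same pipeline for this uncased checkpoint, mean-pooling the last hidden states into a single sentence vector (the sentence and the pooling choice are illustrative; `do_lowercase=True` matches this model's tokenizer configuration):
+
+```python
+import torch
+from transformers import FlaubertModel, FlaubertTokenizer
+
+modelname = 'flaubert/flaubert_base_uncased'
+flaubert, log = FlaubertModel.from_pretrained(modelname, output_loading_info=True)
+# Uncased checkpoint: lowercase inputs at tokenization time
+flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=True)
+
+sentence = "Le chat mange une pomme."
+token_ids = torch.tensor([flaubert_tokenizer.encode(sentence)])
+
+# Mean-pool token embeddings from the last layer into one sentence embedding
+sentence_embedding = flaubert(token_ids)[0].mean(dim=1)
+print(sentence_embedding.shape)  # torch.Size([1, 768])
+```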
+
+**Note:** if your `transformers` version is <=2.10.0, `modelname` should take one of the following values:
+
+```
+['flaubert-small-cased', 'flaubert-base-uncased', 'flaubert-base-cased', 'flaubert-large-cased']
+```
+
+## References
+
+If you use FlauBERT or the FLUE Benchmark for your scientific publication, or if you find the resources in this repository useful, please cite one of the following papers:
+
+[LREC paper](http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.302.pdf)
+```
+@InProceedings{le2020flaubert,
+  author    = {Le, Hang and Vial, Lo\"{i}c and Frej, Jibril and Segonne, Vincent and Coavoux, Maximin and Lecouteux, Benjamin and Allauzen, Alexandre and Crabb\'{e}, Beno\^{i}t and Besacier, Laurent and Schwab, Didier},
+  title     = {FlauBERT: Unsupervised Language Model Pre-training for French},
+  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference},
+  month     = {May},
+  year      = {2020},
+  address   = {Marseille, France},
+  publisher = {European Language Resources Association},
+  pages     = {2479--2490},
+  url       = {https://www.aclweb.org/anthology/2020.lrec-1.302}
+}
+```
+
+[TALN paper](https://hal.archives-ouvertes.fr/hal-02784776/)
+```
+@inproceedings{le2020flaubert,
+  title        = {FlauBERT: des mod{\`e}les de langue contextualis{\'e}s pr{\'e}-entra{\^\i}n{\'e}s pour le fran{\c{c}}ais},
+  author       = {Le, Hang and Vial, Lo{\"\i}c and Frej, Jibril and Segonne, Vincent and Coavoux, Maximin and Lecouteux, Benjamin and Allauzen, Alexandre and Crabb{\'e}, Beno{\^\i}t and Besacier, Laurent and Schwab, Didier},
+  booktitle    = {Actes de la 6e conf{\'e}rence conjointe Journ{\'e}es d'{\'E}tudes sur la Parole (JEP, 31e {\'e}dition), Traitement Automatique des Langues Naturelles (TALN, 27e {\'e}dition), Rencontre des {\'E}tudiants Chercheurs en Informatique pour le Traitement Automatique des Langues (R{\'E}CITAL, 22e {\'e}dition). Volume 2: Traitement Automatique des Langues Naturelles},
+  pages        = {268--278},
+  year         = {2020},
+  organization = {ATALA}
+}
+```
config.json
ADDED
@@ -0,0 +1,77 @@
+{
+  "amp": 1,
+  "architectures": [
+    "FlaubertWithLMHeadModel"
+  ],
+  "asm": false,
+  "attention_dropout": 0.1,
+  "bos_index": 0,
+  "bos_token_id": 0,
+  "bptt": 512,
+  "causal": false,
+  "clip_grad_norm": 5,
+  "dropout": 0.1,
+  "emb_dim": 768,
+  "embed_init_std": 0.02209708691207961,
+  "encoder_only": true,
+  "end_n_top": 5,
+  "eos_index": 1,
+  "fp16": true,
+  "gelu_activation": true,
+  "group_by_size": true,
+  "id2lang": {
+    "0": "fr"
+  },
+  "init_std": 0.02,
+  "is_encoder": true,
+  "lang2id": {
+    "fr": 0
+  },
+  "lang_id": 0,
+  "langs": [
+    "fr"
+  ],
+  "layer_norm_eps": 1e-12,
+  "layerdrop": 0.0,
+  "lg_sampling_factor": -1,
+  "lgs": "fr",
+  "mask_index": 5,
+  "mask_token_id": 0,
+  "max_batch_size": 0,
+  "max_position_embeddings": 512,
+  "max_vocab": -1,
+  "mlm_steps": [
+    [
+      "fr",
+      null
+    ]
+  ],
+  "model_type": "flaubert",
+  "n_heads": 12,
+  "n_langs": 1,
+  "n_layers": 12,
+  "pad_index": 2,
+  "pad_token_id": 2,
+  "pre_norm": false,
+  "sample_alpha": 0,
+  "share_inout_emb": true,
+  "sinusoidal_embeddings": false,
+  "start_n_top": 5,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "first",
+  "summary_use_proj": true,
+  "tokens_per_batch": -1,
+  "unk_index": 3,
+  "use_lang_emb": true,
+  "vocab_size": 67542,
+  "word_blank": 0,
+  "word_dropout": 0,
+  "word_keep": 0.1,
+  "word_mask": 0.8,
+  "word_mask_keep_rand": "0.8,0.1,0.1",
+  "word_pred": 0.15,
+  "word_rand": 0.1,
+  "word_shuffle": 0
+}
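As a quick sanity check, the architecture fields above can be read back with `transformers` (a minimal sketch; the attribute names follow the `FlaubertConfig` API and the printed values simply mirror this config.json):

```python
from transformers import FlaubertConfig

# Loads the config.json shown above from the Hub
config = FlaubertConfig.from_pretrained("flaubert/flaubert_base_uncased")
print(config.n_layers, config.n_heads, config.emb_dim)  # 12 12 768
```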
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92f2d4fab04253ac3976f43b47c86ebae5423fe0488ccdececfa27eec850687c
+size 549587475
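This is a Git LFS pointer rather than the weights themselves. The object size is consistent with the model table above: roughly 137 M float32 parameters at 4 bytes each give 137e6 × 4 ≈ 548 MB, close to the 549,587,475 bytes recorded here.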
tokenizer_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "do_lowercase": true,
+  "do_lower_case": true
+}
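Both flags mark this checkpoint as uncased, so the tokenizer lowercases input before applying BPE. A minimal sketch of the effect (the example sentence is illustrative):

```python
from transformers import FlaubertTokenizer

# do_lowercase is picked up from tokenizer_config.json for this checkpoint
tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_uncased")
# Cased and lowercased variants of the same sentence tokenize identically
print(tokenizer.tokenize("Le chat mange une pomme.") ==
      tokenizer.tokenize("le chat mange une pomme."))  # True
```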
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff