spencer committed on
Commit a05d4c0
1 Parent(s): 9c7dcc5

init commit

README.md CHANGED
@@ -1,3 +1,31 @@
- ---
- license: mit
- ---
+ This model has been trained without supervision following the approach described in [Towards Unsupervised Dense Information Retrieval with Contrastive Learning](https://arxiv.org/abs/2112.09118). The associated GitHub repository is available at https://github.com/facebookresearch/contriever.
+
+ ## Usage (HuggingFace Transformers)
+ Using the model directly through HuggingFace Transformers requires adding a mean pooling operation on top of the token embeddings to obtain a sentence embedding.
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+
+ tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
+ model = AutoModel.from_pretrained('facebook/contriever')
+
+ sentences = [
+     "Where was Marie Curie born?",
+     "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
+     "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
+ ]
+
+ # Apply tokenizer
+ inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ outputs = model(**inputs)
+
+ # Mean pooling: zero out padding positions, then average the remaining token embeddings
+ def mean_pooling(token_embeddings, mask):
+     token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
+     sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
+     return sentence_embeddings
+ embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
+ ```
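
The committed README stops at the sentence embeddings. As a hedged follow-up (not part of the committed file), the sketch below shows how those embeddings are typically compared, assuming the dot-product similarity used in the Contriever setup and reusing the `embeddings` tensor from the snippet above.

```python
# Continues from the README snippet: `embeddings` holds one vector per sentence.
query_embedding = embeddings[0]        # the question
passage_embeddings = embeddings[1:]    # the two candidate passages

# Score query/passage pairs with a dot product.
scores = passage_embeddings @ query_embedding      # shape: (2,)
best = torch.argmax(scores).item()
print(scores.tolist(), "-> most relevant passage index:", best)
```

With these example sentences, one would expect the Marie Curie biography to score higher than the Pierre Curie one.
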
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "Contriever"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.15.0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
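
Although `architectures` lists a custom `Contriever` class, the `model_type` is `bert`, so the checkpoint is structurally a BERT-base encoder. A short sketch (not part of the commit) of inspecting this config with `transformers`, assuming the files are fetched from the Hub under the same `facebook/contriever` name used in the README:

```python
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained('facebook/contriever')
print(config.model_type)            # "bert"
print(config.hidden_size,           # 768
      config.num_hidden_layers,     # 12
      config.num_attention_heads)   # 12

# Since model_type is "bert", AutoModel resolves to a plain BertModel,
# which is why the plain AutoModel call in the README works.
model = AutoModel.from_pretrained('facebook/contriever')
print(type(model).__name__)         # expected: BertModel
```
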
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0b6e2516913f812360475154323f50e2b1e1da0ce6a435175e08f218784c6fb
+ size 438007537
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}