antoinelouis committed on
Commit 1fbb475
1 Parent(s): 3c3ed5d

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "_name_or_path": "google/mt5-base",
+   "architectures": [
+     "MT5EncoderForSequenceClassification"
+   ],
+   "classifier_dropout": 0.0,
+   "d_ff": 2048,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": false,
+   "is_gated_act": true,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "mt5",
+   "num_decoder_layers": null,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "pooling_mode": "mean",
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "tokenizer_class": "T5Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.36.2",
+   "use_cache": false,
+   "vocab_size": 250112
+ }
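
Note: this config declares the custom MT5EncoderForSequenceClassification architecture (defined in the t5.py file added below) with mean pooling over the encoder outputs. A minimal loading sketch, assuming the uploaded folder is used as a local path or Hub repository id (the name below is a placeholder, not part of this commit):

# Illustrative loading sketch; "path/or/repo-id" is a hypothetical placeholder for this model folder.
from transformers import AutoConfig, AutoTokenizer
from t5 import MT5EncoderForSequenceClassification  # class shipped in this repository's t5.py

model_name = "path/or/repo-id"
config = AutoConfig.from_pretrained(model_name)  # picks up pooling_mode="mean" from config.json
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MT5EncoderForSequenceClassification.from_pretrained(model_name, config=config)
model.eval()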
mmarco_smalldev_scores.csv ADDED
@@ -0,0 +1,7 @@
+ epoch,steps,cutoff,mrr,recall,r-precision
+ 0,20000,5,0.2699546322827125,0.42583572110792745,0.17494030563514806
+ 0,20000,10,0.28493837267476235,0.5348018147086915,0.17494030563514806
+ 0,20000,20,0.2916781083810777,0.6319126074498568,0.17494030563514806
+ 0,20000,50,0.29533158523611325,0.744878223495702,0.17494030563514806
+ 0,20000,100,0.2963288717867327,0.8173352435530086,0.17494030563514806
+ 0,20000,500,0.2970346866747076,0.9555157593123209,0.17494030563514806
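
Note: each row reports MRR and recall at the given cutoff on the mMARCO small dev set for the checkpoint after 20,000 training steps; r-precision does not depend on the cutoff, hence is constant across rows. As a reference for how a cutoff-based MRR is typically computed (an illustrative sketch, not necessarily the author's evaluation script):

# Hypothetical helper: mean reciprocal rank at cutoff k, given per-query rankings.
def mrr_at_k(rankings, relevant_ids, k):
    """rankings: list of ranked passage-id lists (one per query);
    relevant_ids: list of sets of relevant passage ids (one per query)."""
    total = 0.0
    for ranking, relevant in zip(rankings, relevant_ids):
        for rank, pid in enumerate(ranking[:k], start=1):
            if pid in relevant:
                total += 1.0 / rank
                break  # only the first relevant hit counts
    return total / len(rankings)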
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4fb8927f32655411e3ecb62e140889e3ff1658c945d34394b9c0236d3a66dd7
+ size 1110540068
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
+ size 4309802
t5.py ADDED
@@ -0,0 +1,195 @@
+ import torch
+ from torch import nn
+ from typing import Optional, Union, Tuple
+
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from transformers.models.t5.modeling_t5 import T5Config, T5ClassificationHead, T5EncoderModel
+ from transformers.models.mt5.modeling_mt5 import MT5Config, MT5ClassificationHead, MT5EncoderModel
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
+
+
+ def clean_t5_config(config: Union[T5Config, MT5Config], model_type: str):
+     assert model_type in ['t5', 'mt5']
+     setattr(config, 'pooling_mode', 'mean')
+     setattr(config, 'model_type', model_type)
+     setattr(config, 'use_cache', False)
+     setattr(config, 'is_encoder_decoder', False)
+     setattr(config, 'num_decoder_layers', None)
+     delattr(config, 'task_specific_params') if hasattr(config, 'task_specific_params') else None
+
+
+ class T5EncoderForSequenceClassification(T5EncoderModel):
+     """
+     T5 encoder for sequence classification tasks.
+
+     :param config: The T5 configuration object.
+     """
+     def __init__(self, config: T5Config):
+         super().__init__(config)
+         self.pool_layer = PoolLayer(config.pooling_mode)
+         self.classification_head = T5ClassificationHead(config)
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+         """
+         Forward pass of the T5 encoder for sequence classification tasks.
+
+         :param input_ids: The input token IDs.
+         :param attention_mask: The attention mask.
+         :param head_mask: The head mask.
+         :param inputs_embeds: The input embeddings.
+         :param labels: The target labels.
+         :param output_attentions: Whether to output attentions.
+         :param output_hidden_states: Whether to output hidden states.
+         :param return_dict: Whether to return a dictionary.
+         :returns: The logits for the classification task or a dictionary containing the outputs.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         loss = None
+
+         outputs = self.encoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             head_mask=head_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         sequence_output = self.pool_layer(outputs.last_hidden_state, attention_mask)
+         logits = self.classification_head(sequence_output)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class MT5EncoderForSequenceClassification(MT5EncoderModel):
+     """
+     mT5 encoder for sequence classification tasks.
+
+     :param config: The mT5 configuration object.
+     """
+     def __init__(self, config: MT5Config):
+         super().__init__(config)
+         self.pool_layer = PoolLayer(config.pooling_mode)
+         self.classification_head = MT5ClassificationHead(config)
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+         """
+         Forward pass of the mT5 encoder for sequence classification tasks.
+
+         :param input_ids: The input token IDs.
+         :param attention_mask: The attention mask.
+         :param head_mask: The head mask.
+         :param inputs_embeds: The input embeddings.
+         :param labels: The target labels.
+         :param output_attentions: Whether to output attentions.
+         :param output_hidden_states: Whether to output hidden states.
+         :param return_dict: Whether to return a dictionary.
+         :returns: The logits for the classification task or a dictionary containing the outputs.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         loss = None
+
+         outputs = self.encoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             head_mask=head_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         sequence_output = self.pool_layer(outputs.last_hidden_state, attention_mask)
+         logits = self.classification_head(sequence_output)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class PoolLayer(nn.Module):
+     """
+     Pooling layer on top of the computed token embeddings.
+
+     :param pooling_mode: The pooling strategy to use.
+     """
+     def __init__(self, pooling_mode: str):
+         super().__init__()
+         assert pooling_mode in ['first', 'mean', 'max'], f"ERROR: Unknown pooling strategy '{pooling_mode}'"
+         self.pooling_mode = pooling_mode
+
+     def forward(self, token_embeddings: torch.Tensor, attention_masks: torch.Tensor) -> torch.Tensor:
+         """
+         Compute the passage vector by pooling the token embeddings.
+
+         :param token_embeddings: A 3D tensor of size [batch_size, seq_len, d_model].
+         :param attention_masks: A 2D tensor of size [batch_size, seq_len].
+         :returns: A 2D tensor of size [batch_size, d_model].
+         """
+         if self.pooling_mode == 'first':
+             text_vectors = token_embeddings[:, 0, :]
+         elif self.pooling_mode == 'max':
+             # Set all values of the [PAD] embeddings to large negative values (so that they are never considered as maximum for a channel).
+             attention_masks_expanded = attention_masks.unsqueeze(-1).expand(token_embeddings.size())
+             token_embeddings[attention_masks_expanded == 0] = -1e+9 if token_embeddings.dtype == torch.float32 else -1e+4
+             # Compute the maxima along the 'seq_length' dimension (-> Tensor[batch_size, d_model]).
+             text_vectors = torch.max(token_embeddings, dim=1).values
+         else:
+             # Set all values of the [PAD] embeddings to zeros (so that they are not taken into account in the sum for a channel).
+             attention_masks_expanded = attention_masks.unsqueeze(-1).expand(token_embeddings.size())
+             token_embeddings[attention_masks_expanded == 0] = 0.0
+             # Compute the means by first summing along the 'seq_length' dimension (-> Tensor[batch_size, d_model]).
+             sum_embeddings = torch.sum(token_embeddings, dim=1)
+             # Then, divide all values of a passage vector by the original passage length.
+             sum_mask = attention_masks_expanded.sum(dim=1)  # -> Tensor[batch_size, d_model] where each value is the length of the corresponding passage.
+             sum_mask = torch.clamp(sum_mask, min=1e-7)  # Make sure not to have zeros by lower bounding all elements to 1e-7.
+             text_vectors = sum_embeddings / sum_mask  # Divide each dimension by the sequence length.
+         return text_vectors
+
+
+ if __name__ == "__main__":
+     model_name = "google/mt5-base"  # example checkpoint; replace with the target model path or Hub id.
+     tokenizer_args = {}  # extra tokenizer kwargs, e.g. {'model_max_length': 256} as in tokenizer_config.json.
+     config = AutoConfig.from_pretrained(model_name)
+     if isinstance(config, T5Config):
+         clean_t5_config(config, model_type='t5')
+         model = T5EncoderForSequenceClassification.from_pretrained(model_name, config=config)
+     elif isinstance(config, MT5Config):
+         clean_t5_config(config, model_type='mt5')
+         model = MT5EncoderForSequenceClassification.from_pretrained(model_name, config=config)
+     else:
+         model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
+     tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_args)
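
Note: since config.json defines a single label, the encoder-only classifier produces one logit per input and can be used as a cross-encoder style reranker over query-passage pairs. An illustrative sketch, assuming model and tokenizer have been loaded as above (the query and passage strings are made-up examples):

# Hypothetical usage: score one query-passage pair with the encoder-only classifier.
import torch

query = "what is the capital of france"
passage = "Paris is the capital and most populous city of France."
inputs = tokenizer(query, passage, truncation=True, max_length=256, return_tensors="pt")
with torch.no_grad():
    score = model(**inputs).logits.squeeze(-1)  # higher logit = more relevant
print(score.item())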
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d1ae0f5ccb75b6fbb45cda86ea3c1cc20beb32758bebc9629efc9207c8b08f06
+ size 16330714
tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "extra_ids": 0,
+   "legacy": true,
+   "model_max_length": 256,
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "T5Tokenizer",
+   "unk_token": "<unk>"
+ }