pere committed on
Commit
34b9ad7
·
verified ·
1 Parent(s): 0bff72a

Saving weights and logs of step 2500

Browse files
.run_train.sh.un~ ADDED
Binary file (4.3 kB). View file
 
.train.sh.un~ ADDED
Binary file (4.46 kB). View file
 
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./",
3
+ "architectures": [
4
+ "RobertaForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "transformers_version": "4.47.0",
23
+ "type_vocab_size": 1,
24
+ "use_cache": true,
25
+ "vocab_size": 50265
26
+ }
create_config.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from transformers import RobertaConfig

# Load the configuration published under "FacebookAI/roberta-base", passing the
# vocabulary size explicitly as an override.
config = RobertaConfig.from_pretrained(
    "FacebookAI/roberta-base",
    vocab_size=50265,
)

# Persist the resulting configuration into the current directory.
config.save_pretrained("./")
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d80fa20f4f3dad1c33ea01df50e4cd9712831720e6be57166143d745b1d9c6cb
3
+ size 498796983
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
run_mlm_flax.py ADDED
@@ -0,0 +1,925 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
18
+ text file or a dataset.
19
+
20
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
21
+ https://huggingface.co/models?filter=fill-mask
22
+ """
23
+
24
+ import json
25
+ import logging
26
+ import math
27
+ import os
28
+ import sys
29
+ import time
30
+ from dataclasses import asdict, dataclass, field
31
+ from enum import Enum
32
+ from itertools import chain
33
+
34
+ # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
35
+ from pathlib import Path
36
+ from typing import Dict, List, Optional, Tuple
37
+
38
+ import flax
39
+ import jax
40
+ import jax.numpy as jnp
41
+ import numpy as np
42
+ import optax
43
+ from datasets import load_dataset
44
+ from flax import jax_utils, traverse_util
45
+ from flax.jax_utils import pad_shard_unpad
46
+ from flax.training import train_state
47
+ from flax.training.common_utils import get_metrics, onehot, shard
48
+ from huggingface_hub import HfApi
49
+ from tqdm import tqdm
50
+
51
+ from transformers import (
52
+ CONFIG_MAPPING,
53
+ FLAX_MODEL_FOR_MASKED_LM_MAPPING,
54
+ AutoConfig,
55
+ AutoTokenizer,
56
+ FlaxAutoModelForMaskedLM,
57
+ HfArgumentParser,
58
+ PreTrainedTokenizerBase,
59
+ TensorType,
60
+ is_tensorboard_available,
61
+ set_seed,
62
+ )
63
+ from transformers.utils import send_example_telemetry
64
+
65
+
66
+ MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
67
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
68
+
69
+
70
@dataclass
class TrainingArguments:
    """
    Options that drive the training / evaluation run: where outputs go, batch sizes, optimizer hyper-parameters,
    logging / saving / evaluation cadence and Hub upload settings.
    """

    # --- output location -------------------------------------------------------------------------------------
    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory. "
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    # --- what to run and with which batch sizes --------------------------------------------------------------
    do_train: bool = field(
        default=False,
        metadata={"help": "Whether to run training."},
    )
    do_eval: bool = field(
        default=False,
        metadata={"help": "Whether to run eval on the dev set."},
    )
    per_device_train_batch_size: int = field(
        default=8,
        metadata={"help": "Batch size per GPU/TPU core/CPU for training."},
    )
    per_device_eval_batch_size: int = field(
        default=8,
        metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."},
    )

    # --- optimizer and schedule ------------------------------------------------------------------------------
    learning_rate: float = field(
        default=5e-5,
        metadata={"help": "The initial learning rate for AdamW."},
    )
    weight_decay: float = field(
        default=0.0,
        metadata={"help": "Weight decay for AdamW if we apply some."},
    )
    adam_beta1: float = field(
        default=0.9,
        metadata={"help": "Beta1 for AdamW optimizer"},
    )
    adam_beta2: float = field(
        default=0.999,
        metadata={"help": "Beta2 for AdamW optimizer"},
    )
    adam_epsilon: float = field(
        default=1e-8,
        metadata={"help": "Epsilon for AdamW optimizer."},
    )
    adafactor: bool = field(
        default=False,
        metadata={"help": "Whether or not to replace AdamW by Adafactor."},
    )
    num_train_epochs: float = field(
        default=3.0,
        metadata={"help": "Total number of training epochs to perform."},
    )
    warmup_steps: int = field(
        default=0,
        metadata={"help": "Linear warmup over warmup_steps."},
    )

    # --- logging / checkpointing / evaluation cadence --------------------------------------------------------
    logging_steps: int = field(
        default=500,
        metadata={"help": "Log every X updates steps."},
    )
    save_steps: int = field(
        default=500,
        metadata={"help": "Save checkpoint every X updates steps."},
    )
    eval_steps: int = field(
        default=None,
        metadata={"help": "Run an evaluation every X steps."},
    )
    seed: int = field(
        default=42,
        metadata={"help": "Random seed that will be set at the beginning of training."},
    )

    # --- Hub upload ------------------------------------------------------------------------------------------
    push_to_hub: bool = field(
        default=False,
        metadata={"help": "Whether or not to upload the trained model to the model hub after training."},
    )
    hub_model_id: str = field(
        default=None,
        metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."},
    )
    hub_token: str = field(
        default=None,
        metadata={"help": "The token to use to push to the Model Hub."},
    )

    # --- memory ----------------------------------------------------------------------------------------------
    gradient_checkpointing: bool = field(
        default=False,
        metadata={
            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
        },
    )

    def __post_init__(self):
        """Expand a leading `~` in `output_dir` (left untouched when it is `None`)."""
        if self.output_dir is None:
            return
        self.output_dir = os.path.expanduser(self.output_dir)

    def to_dict(self):
        """
        Return the arguments as a plain dict: `Enum` members (and lists of them) are replaced by their values,
        and every field whose name ends in `_token` is replaced by a `<FIELD_NAME>` placeholder so that the
        secret itself is never emitted.
        """
        serialized = asdict(self)
        for name in list(serialized):
            value = serialized[name]
            if isinstance(value, Enum):
                serialized[name] = value.value
            if isinstance(value, list) and len(value) > 0 and isinstance(value[0], Enum):
                serialized[name] = [member.value for member in value]
            # Applied last so that a token field is masked regardless of its type.
            if name.endswith("_token"):
                serialized[name] = f"<{name.upper()}>"
        return serialized
136
+
137
+
138
@dataclass
class ModelArguments:
    """
    Options selecting the model, configuration and tokenizer to fine-tune or to train from scratch, plus the
    settings used when fetching them (cache location, auth token, remote-code trust) and the training dtype.
    """

    # Checkpoint used to initialise the weights; leave unset to train from scratch.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    # Only consulted when neither a config name nor a checkpoint is given.
    model_type: Optional[str] = field(
        default=None,
        metadata={
            "help": f"If training from scratch, pass a model type from the list: {', '.join(MODEL_TYPES)}"
        },
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from s3"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    # Name of a `jax.numpy` dtype; resolved by the caller via `getattr(jnp, dtype)`.
    dtype: Optional[str] = field(
        default="float32",
        metadata={
            "help": (
                "Floating-point format in which the model weights should be initialized and trained. Choose one of"
                " `[float32, float16, bfloat16]`."
            )
        },
    )
    token: str = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to trust the execution of code from datasets/models defined on the Hub."
                " This option should only be set to `True` for repositories you trust and in which you have read the"
                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
197
+
198
+
199
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    At least one of `dataset_name`, `train_file` or `validation_file` must be provided. Local files must end in
    `.csv`, `.json` or `.txt`.

    Raises:
        ValueError: if no data source is given, or if a provided file has an unsupported extension.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    train_ref_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
    )
    validation_ref_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    max_seq_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated. Default to the max input length of the model."
            )
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    mlm_probability: float = field(
        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to `max_seq_length`. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
            )
        },
    )
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )

    def __post_init__(self):
        """Validate that a data source was given and that local files use a supported extension."""
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")

        # Explicit raises instead of `assert`: assertions are stripped under `python -O`, which would let an
        # unsupported file type slip through to dataset loading. The comparison stays case-sensitive on purpose,
        # matching the original check.
        supported_extensions = ("csv", "json", "txt")
        for arg_name, path in (("train_file", self.train_file), ("validation_file", self.validation_file)):
            if path is None:
                continue
            extension = path.split(".")[-1]
            if extension not in supported_extensions:
                raise ValueError(f"`{arg_name}` should be a csv, a json or a txt file.")
273
+
274
+
275
@flax.struct.dataclass
class FlaxDataCollatorForLanguageModeling:
    """
    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
    are not all of the same length.

    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
            The probability with which to (randomly) mask tokens in the input.

    .. note::

        For best performance, this data collator should be used with a dataset having items that are dictionaries or
        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
        argument :obj:`return_special_tokens_mask=True`. When the key is absent, the mask is recomputed from the
        tokenizer for every batch.
    """

    tokenizer: PreTrainedTokenizerBase
    mlm_probability: float = 0.15

    def __post_init__(self):
        """Reject tokenizers that cannot be used for masked language modeling."""
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )

    def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
        """
        Pad `examples` into a single NumPy batch and apply MLM masking.

        Returns the padded batch with `input_ids` replaced by their masked version and a new `labels` entry;
        `special_tokens_mask`, if present, is removed from the batch.
        """
        # Handle dict or lists with proper padding and conversion to tensor.
        batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)

        batch["input_ids"], batch["labels"] = self.mask_tokens(
            batch["input_ids"], special_tokens_mask=special_tokens_mask
        )
        return batch

    def mask_tokens(
        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

        Args:
            inputs: token ids; modified in place and also returned.
            special_tokens_mask: truthy where a position holds a special token and must never be selected.
                When `None`, the mask is derived from the tokenizer.

        Returns:
            `(inputs, labels)` where `labels` equals the original ids at selected positions and -100 elsewhere.
        """
        labels = inputs.copy()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = np.full(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            # The caller had no precomputed mask (e.g. the dataset was tokenized without
            # `return_special_tokens_mask=True`); previously this path crashed on `None.astype`.
            special_tokens_mask = np.array(
                [
                    self.tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
                    for ids in labels.tolist()
                ],
                dtype=bool,
            )
        else:
            special_tokens_mask = special_tokens_mask.astype("bool")

        probability_matrix[special_tokens_mask] = 0.0
        masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        # (a 0.5 draw over the 20% of selected positions that were not replaced by the mask token)
        indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
        indices_random &= masked_indices & ~indices_replaced

        random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
345
+
346
+
347
def generate_batch_splits(samples_idx: np.ndarray, batch_size: int, drop_last=True) -> np.ndarray:
    """Split sample indices into batches.

    Args:
        samples_idx: 1-D array of sample indices.
        batch_size: number of samples per batch.
        drop_last: if `True`, samples that do not fill a complete batch are discarded and a 2-D array of shape
            `(num_batches, batch_size)` is returned. If `False`, no sample is discarded: the indices are divided
            with `np.array_split` into `ceil(num_samples / batch_size)` chunks and returned as a list of arrays
            (chunks may be smaller than `batch_size`).

    Returns:
        The batched indices; an empty list when `drop_last` is `False` and there are no samples.
    """
    num_samples = len(samples_idx)

    if drop_last:
        num_batches, samples_to_remove = divmod(num_samples, batch_size)
        if samples_to_remove != 0:
            samples_idx = samples_idx[:-samples_to_remove]
        return samples_idx.reshape((num_batches, batch_size))

    if num_samples == 0:
        # `np.array_split` rejects a section count of 0, so an empty input yields no batches.
        return []

    sections_split = math.ceil(num_samples / batch_size)
    return np.array_split(samples_idx, sections_split)
361
+
362
+
363
def write_train_metric(summary_writer, train_metrics, train_time, step):
    """
    Log the elapsed training time at `step`, then log every accumulated training metric under a `train_` prefix.

    The values of each metric are written to consecutive steps ending at `step`, i.e. the last value lands on
    `step` and earlier values on the steps right before it.
    """
    summary_writer.scalar("train_time", train_time, step)

    stacked_metrics = get_metrics(train_metrics)
    for metric_name, values in stacked_metrics.items():
        tag = f"train_{metric_name}"
        num_values = len(values)
        for offset, value in enumerate(values):
            summary_writer.scalar(tag, value, step - num_values + offset + 1)
371
+
372
+
373
def write_eval_metric(summary_writer, eval_metrics, step):
    """Log every entry of `eval_metrics` at `step`, tagging each one with an `eval_` prefix."""
    for entry in eval_metrics.items():
        name, metric_value = entry
        tag = f"eval_{name}"
        summary_writer.scalar(tag, metric_value, step)
376
+
377
+
378
+ def main():
379
+ # See all possible arguments in src/transformers/training_args.py
380
+ # or by passing the --help flag to this script.
381
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
382
+
383
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
384
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
385
+ # If we pass only one argument to the script and it's the path to a json file,
386
+ # let's parse it to get our arguments.
387
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
388
+ else:
389
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
390
+
391
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
392
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
393
+ send_example_telemetry("run_mlm", model_args, data_args, framework="flax")
394
+
395
+ if (
396
+ os.path.exists(training_args.output_dir)
397
+ and os.listdir(training_args.output_dir)
398
+ and training_args.do_train
399
+ and not training_args.overwrite_output_dir
400
+ ):
401
+ raise ValueError(
402
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
403
+ "Use --overwrite_output_dir to overcome."
404
+ )
405
+
406
+ # Setup logging
407
+ logging.basicConfig(
408
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
409
+ level=logging.INFO,
410
+ datefmt="[%X]",
411
+ )
412
+
413
+ # Log on each process the small summary:
414
+ logger = logging.getLogger(__name__)
415
+
416
+ # Set the verbosity to info of the Transformers logger (on main process only):
417
+ logger.info(f"Training/evaluation parameters {training_args}")
418
+
419
+ # Set seed before initializing model.
420
+ set_seed(training_args.seed)
421
+
422
+ # Handle the repository creation
423
+ if training_args.push_to_hub:
424
+ # Retrieve or infer repo_name
425
+ repo_name = training_args.hub_model_id
426
+ if repo_name is None:
427
+ repo_name = Path(training_args.output_dir).absolute().name
428
+ # Create repo and retrieve repo_id
429
+ api = HfApi()
430
+ repo_id = api.create_repo(repo_name, exist_ok=True, token=training_args.hub_token).repo_id
431
+
432
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
433
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
434
+ # (the dataset will be downloaded automatically from the datasets Hub).
435
+ #
436
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
437
+ # 'text' is found. You can easily tweak this behavior (see below).
438
+ #
439
+ # In distributed training, the load_dataset function guarantees that only one local process can concurrently
440
+ # download the dataset.
441
+ if data_args.dataset_name is not None:
442
+ # Downloading and loading a dataset from the hub.
443
+ datasets = load_dataset(
444
+ data_args.dataset_name,
445
+ data_args.dataset_config_name,
446
+ cache_dir=model_args.cache_dir,
447
+ token=model_args.token,
448
+ num_proc=data_args.preprocessing_num_workers,
449
+ trust_remote_code=model_args.trust_remote_code,
450
+ )
451
+
452
+ if "validation" not in datasets.keys():
453
+ datasets["validation"] = load_dataset(
454
+ data_args.dataset_name,
455
+ data_args.dataset_config_name,
456
+ split=f"train[:{data_args.validation_split_percentage}%]",
457
+ cache_dir=model_args.cache_dir,
458
+ token=model_args.token,
459
+ num_proc=data_args.preprocessing_num_workers,
460
+ trust_remote_code=model_args.trust_remote_code,
461
+ )
462
+ datasets["train"] = load_dataset(
463
+ data_args.dataset_name,
464
+ data_args.dataset_config_name,
465
+ split=f"train[{data_args.validation_split_percentage}%:]",
466
+ cache_dir=model_args.cache_dir,
467
+ token=model_args.token,
468
+ num_proc=data_args.preprocessing_num_workers,
469
+ trust_remote_code=model_args.trust_remote_code,
470
+ )
471
+ else:
472
+ data_files = {}
473
+ if data_args.train_file is not None:
474
+ data_files["train"] = data_args.train_file
475
+ extension = data_args.train_file.split(".")[-1]
476
+ if data_args.validation_file is not None:
477
+ data_files["validation"] = data_args.validation_file
478
+ extension = data_args.validation_file.split(".")[-1]
479
+ if extension == "txt":
480
+ extension = "text"
481
+ datasets = load_dataset(
482
+ extension,
483
+ data_files=data_files,
484
+ cache_dir=model_args.cache_dir,
485
+ token=model_args.token,
486
+ num_proc=data_args.preprocessing_num_workers,
487
+ )
488
+
489
+ if "validation" not in datasets.keys():
490
+ datasets["validation"] = load_dataset(
491
+ extension,
492
+ data_files=data_files,
493
+ split=f"train[:{data_args.validation_split_percentage}%]",
494
+ cache_dir=model_args.cache_dir,
495
+ token=model_args.token,
496
+ num_proc=data_args.preprocessing_num_workers,
497
+ )
498
+ datasets["train"] = load_dataset(
499
+ extension,
500
+ data_files=data_files,
501
+ split=f"train[{data_args.validation_split_percentage}%:]",
502
+ cache_dir=model_args.cache_dir,
503
+ token=model_args.token,
504
+ num_proc=data_args.preprocessing_num_workers,
505
+ )
506
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
507
+ # https://huggingface.co/docs/datasets/loading_datasets.
508
+
509
+ # Load pretrained model and tokenizer
510
+
511
+ # Distributed training:
512
+ # The .from_pretrained methods guarantee that only one local process can concurrently
513
+ # download model & vocab.
514
+ if model_args.config_name:
515
+ config = AutoConfig.from_pretrained(
516
+ model_args.config_name,
517
+ cache_dir=model_args.cache_dir,
518
+ token=model_args.token,
519
+ trust_remote_code=model_args.trust_remote_code,
520
+ )
521
+ elif model_args.model_name_or_path:
522
+ config = AutoConfig.from_pretrained(
523
+ model_args.model_name_or_path,
524
+ cache_dir=model_args.cache_dir,
525
+ token=model_args.token,
526
+ trust_remote_code=model_args.trust_remote_code,
527
+ )
528
+ else:
529
+ config = CONFIG_MAPPING[model_args.model_type]()
530
+ logger.warning("You are instantiating a new config instance from scratch.")
531
+
532
+ if model_args.tokenizer_name:
533
+ tokenizer = AutoTokenizer.from_pretrained(
534
+ model_args.tokenizer_name,
535
+ cache_dir=model_args.cache_dir,
536
+ use_fast=model_args.use_fast_tokenizer,
537
+ token=model_args.token,
538
+ trust_remote_code=model_args.trust_remote_code,
539
+ )
540
+ elif model_args.model_name_or_path:
541
+ tokenizer = AutoTokenizer.from_pretrained(
542
+ model_args.model_name_or_path,
543
+ cache_dir=model_args.cache_dir,
544
+ use_fast=model_args.use_fast_tokenizer,
545
+ token=model_args.token,
546
+ trust_remote_code=model_args.trust_remote_code,
547
+ )
548
+ else:
549
+ raise ValueError(
550
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
551
+ "You can do it from another script, save it, and load it from here, using --tokenizer_name."
552
+ )
553
+
554
+ # Preprocessing the datasets.
555
+ # First we tokenize all the texts.
556
+ if training_args.do_train:
557
+ column_names = datasets["train"].column_names
558
+ else:
559
+ column_names = datasets["validation"].column_names
560
+ text_column_name = "text" if "text" in column_names else column_names[0]
561
+
562
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
563
+
564
+ if data_args.line_by_line:
565
+ # When using line_by_line, we just tokenize each nonempty line.
566
+ padding = "max_length" if data_args.pad_to_max_length else False
567
+
568
+ def tokenize_function(examples):
569
+ # Remove empty lines
570
+ examples = [line for line in examples if len(line) > 0 and not line.isspace()]
571
+ return tokenizer(
572
+ examples,
573
+ return_special_tokens_mask=True,
574
+ padding=padding,
575
+ truncation=True,
576
+ max_length=max_seq_length,
577
+ )
578
+
579
+ tokenized_datasets = datasets.map(
580
+ tokenize_function,
581
+ input_columns=[text_column_name],
582
+ batched=True,
583
+ num_proc=data_args.preprocessing_num_workers,
584
+ remove_columns=column_names,
585
+ load_from_cache_file=not data_args.overwrite_cache,
586
+ )
587
+
588
+ else:
589
+ # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
590
+ # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
591
+ # efficient when it receives the `special_tokens_mask`.
592
+ def tokenize_function(examples):
593
+ return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
594
+
595
+ tokenized_datasets = datasets.map(
596
+ tokenize_function,
597
+ batched=True,
598
+ num_proc=data_args.preprocessing_num_workers,
599
+ remove_columns=column_names,
600
+ load_from_cache_file=not data_args.overwrite_cache,
601
+ )
602
+
603
+ # Main data processing function that will concatenate all texts from our dataset and generate chunks of
604
+ # max_seq_length.
605
+ def group_texts(examples):
606
+ # Concatenate all texts.
607
+ concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
608
+ total_length = len(concatenated_examples[list(examples.keys())[0]])
609
+ # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
610
+ # customize this part to your needs.
611
+ if total_length >= max_seq_length:
612
+ total_length = (total_length // max_seq_length) * max_seq_length
613
+ # Split by chunks of max_len.
614
+ result = {
615
+ k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
616
+ for k, t in concatenated_examples.items()
617
+ }
618
+ return result
619
+
620
+ # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
621
+ # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
622
+ # might be slower to preprocess.
623
+ #
624
+ # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
625
+ # https://huggingface.co/docs/datasets/process#map
626
+ tokenized_datasets = tokenized_datasets.map(
627
+ group_texts,
628
+ batched=True,
629
+ num_proc=data_args.preprocessing_num_workers,
630
+ load_from_cache_file=not data_args.overwrite_cache,
631
+ )
632
+
633
+ # Enable tensorboard only on the master node
634
+ has_tensorboard = is_tensorboard_available()
635
+ if has_tensorboard and jax.process_index() == 0:
636
+ try:
637
+ from flax.metrics.tensorboard import SummaryWriter
638
+
639
+ summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
640
+ except ImportError as ie:
641
+ has_tensorboard = False
642
+ logger.warning(
643
+ f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
644
+ )
645
+ else:
646
+ logger.warning(
647
+ "Unable to display metrics through TensorBoard because the package is not installed: "
648
+ "Please run pip install tensorboard to enable."
649
+ )
650
+
651
+ # Data collator
652
+ # This one will take care of randomly masking the tokens.
653
+ data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
654
+
655
+ # Initialize our training
656
+ rng = jax.random.PRNGKey(training_args.seed)
657
+ dropout_rngs = jax.random.split(rng, jax.local_device_count())
658
+
659
+ if model_args.model_name_or_path:
660
+ model = FlaxAutoModelForMaskedLM.from_pretrained(
661
+ model_args.model_name_or_path,
662
+ config=config,
663
+ seed=training_args.seed,
664
+ dtype=getattr(jnp, model_args.dtype),
665
+ token=model_args.token,
666
+ trust_remote_code=model_args.trust_remote_code,
667
+ )
668
+ else:
669
+ model = FlaxAutoModelForMaskedLM.from_config(
670
+ config,
671
+ seed=training_args.seed,
672
+ dtype=getattr(jnp, model_args.dtype),
673
+ trust_remote_code=model_args.trust_remote_code,
674
+ )
675
+
676
+ if training_args.gradient_checkpointing:
677
+ model.enable_gradient_checkpointing()
678
+
679
+ # Store some constant
680
+ num_epochs = int(training_args.num_train_epochs)
681
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
682
+ per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
683
+ eval_batch_size = per_device_eval_batch_size * jax.device_count()
684
+
685
+ num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
686
+
687
+ # Create learning rate schedule
688
+ warmup_fn = optax.linear_schedule(
689
+ init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
690
+ )
691
+ decay_fn = optax.linear_schedule(
692
+ init_value=training_args.learning_rate,
693
+ end_value=0,
694
+ transition_steps=num_train_steps - training_args.warmup_steps,
695
+ )
696
+ linear_decay_lr_schedule_fn = optax.join_schedules(
697
+ schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
698
+ )
699
+
700
+ # We use Optax's "masking" functionality to not apply weight decay
701
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
702
+ # mask boolean with the same structure as the parameters.
703
+ # The mask is True for parameters that should be decayed.
704
def decay_mask_fn(params):
    """Build the boolean weight-decay mask consumed by the optimizer.

    Args:
        params: nested parameter dictionary of the model.

    Returns:
        A nested dictionary with the same structure as ``params`` whose
        leaves are ``True`` for parameters that should be decayed and
        ``False`` for biases and LayerNorm parameters.
    """
    flat_params = traverse_util.flatten_dict(params)

    # A path counts as LayerNorm when its concatenated, lower-cased key
    # contains any of these markers. The match is a plain substring test,
    # so "ln" also hits any path that merely contains those two letters.
    layer_norm_markers = ("layernorm", "layer_norm", "ln")

    # Only the last two path components are remembered, so every parameter
    # sharing that (module, name) suffix is excluded from decay as well.
    layer_norm_named_params = set()
    for path in flat_params:
        joined_path = "".join(path).lower()
        if any(marker in joined_path for marker in layer_norm_markers):
            layer_norm_named_params.add(path[-2:])

    flat_mask = {}
    for path in flat_params:
        is_bias = path[-1] == "bias"
        is_layer_norm = path[-2:] in layer_norm_named_params
        flat_mask[path] = not is_bias and not is_layer_norm

    return traverse_util.unflatten_dict(flat_mask)
716
+
717
+ # create adam optimizer
718
+ if training_args.adafactor:
719
+ # We use the default parameters here to initialize adafactor,
720
+ # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
721
+ optimizer = optax.adafactor(
722
+ learning_rate=linear_decay_lr_schedule_fn,
723
+ )
724
+ else:
725
+ optimizer = optax.adamw(
726
+ learning_rate=linear_decay_lr_schedule_fn,
727
+ b1=training_args.adam_beta1,
728
+ b2=training_args.adam_beta2,
729
+ eps=training_args.adam_epsilon,
730
+ weight_decay=training_args.weight_decay,
731
+ mask=decay_mask_fn,
732
+ )
733
+
734
+ # Setup train state
735
+ state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
736
+
737
+ # Define gradient update step fn
738
def train_step(state, batch, dropout_rng):
    """Run one masked-LM optimisation step on a single device shard.

    Intended to be wrapped with ``jax.pmap(..., axis_name="batch")``: loss,
    gradients and label counts are summed across the ``"batch"`` axis before
    being normalised.

    Args:
        state: train state exposing ``apply_fn``, ``params``, ``step`` and
            ``apply_gradients``.
        batch: mapping of model inputs that also contains ``"labels"``.
        dropout_rng: PRNG key for this step; it is split so that a fresh key
            can be handed back for the next step.

    Returns:
        ``(new_state, metrics, new_dropout_rng)`` where ``metrics`` holds the
        normalised ``"loss"`` and the current ``"learning_rate"``.
    """
    dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)

    def loss_fn(params):
        # Select labels/inputs without mutating ``batch`` so the function
        # stays safe to trace or call more than once.
        labels = batch["labels"]
        model_inputs = {name: value for name, value in batch.items() if name != "labels"}

        logits = state.apply_fn(**model_inputs, params=params, dropout_rng=dropout_rng, train=True)[0]

        # Only positions with a strictly positive label contribute to the loss
        # (presumably the collator marks unsupervised positions with a
        # negative label - TODO confirm).
        label_mask = jnp.where(labels > 0, 1.0, 0.0)
        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask

        # Return un-normalised sums; normalisation happens after the
        # cross-device reduction.
        return loss.sum(), label_mask.sum()

    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    (loss, num_labels), grad = grad_fn(state.params)
    num_labels = jax.lax.psum(num_labels, "batch")

    # If no position in the global batch is supervised, the summed loss and
    # gradients are exactly zero; dividing by zero would turn them into NaN
    # and poison the parameters. Clamping the divisor keeps them at zero and
    # leaves every non-empty batch untouched (counts are whole numbers >= 1).
    normalizer = jnp.maximum(num_labels, 1.0)

    # true loss = total loss / total samples
    loss = jax.lax.psum(loss, "batch")
    loss = loss / normalizer

    # true grad = total grad / total samples
    grad = jax.lax.psum(grad, "batch")
    grad = jax.tree_util.tree_map(lambda x: x / normalizer, grad)
    new_state = state.apply_gradients(grads=grad)

    # The learning rate is reported for the step that was just taken.
    metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}

    return new_state, metrics, new_dropout_rng
772
+
773
+ # Create parallel version of the train step
774
+ p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
775
+
776
+ # Define eval fn
777
def eval_step(params, batch):
    """Compute summed masked-LM evaluation metrics for one device shard.

    Intended to be wrapped with ``jax.pmap(..., axis_name="batch")``.

    Args:
        params: model parameters to evaluate with.
        batch: mapping of model inputs that also contains ``"labels"``.

    Returns:
        Dict with ``"loss"`` (summed cross-entropy), ``"accuracy"`` (number
        of correct predictions) and ``"normalizer"`` (number of scored
        positions), each summed over the ``"batch"`` axis. Callers divide by
        ``"normalizer"`` to obtain averages.
    """
    # Select labels/inputs without mutating the caller-visible mapping.
    labels = batch["labels"]
    model_inputs = {name: value for name, value in batch.items() if name != "labels"}

    logits = model(**model_inputs, params=params, train=False)[0]

    # Only positions with a strictly positive label are scored.
    label_mask = jnp.where(labels > 0, 1.0, 0.0)
    loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask

    # compute accuracy
    accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask

    # summarize metrics
    metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
    metrics = jax.lax.psum(metrics, axis_name="batch")

    return metrics

# The parameters are NOT donated here: they are still needed after evaluation
# (further training steps and checkpointing), and the outputs are three
# scalars, so donation could not save any memory anyway.
p_eval_step = jax.pmap(eval_step, "batch")
796
+
797
+ # Replicate the train state on each device
798
+ state = jax_utils.replicate(state)
799
+
800
+ train_time = 0
801
+ epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
802
+ for epoch in epochs:
803
+ # ======================== Training ================================
804
+ train_start = time.time()
805
+ train_metrics = []
806
+
807
+ # Create sampling rng
808
+ rng, input_rng = jax.random.split(rng)
809
+
810
+ # Generate an epoch by shuffling sampling indices from the train dataset
811
+ num_train_samples = len(tokenized_datasets["train"])
812
+ # Avoid using jax.numpy here in case of TPU training
813
+ train_samples_idx = np.random.permutation(np.arange(num_train_samples))
814
+ train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
815
+
816
+ # Gather the indexes for creating the batch and do a training step
817
+ for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
818
+ samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
819
+ model_inputs = data_collator(samples, pad_to_multiple_of=16)
820
+
821
+ # Model forward
822
+ model_inputs = shard(model_inputs.data)
823
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
824
+ train_metrics.append(train_metric)
825
+
826
+ cur_step = epoch * (num_train_samples // train_batch_size) + step
827
+
828
+ if cur_step % training_args.logging_steps == 0 and cur_step > 0:
829
+ # Save metrics
830
+ train_metric = jax_utils.unreplicate(train_metric)
831
+ train_time += time.time() - train_start
832
+ if has_tensorboard and jax.process_index() == 0:
833
+ write_train_metric(summary_writer, train_metrics, train_time, cur_step)
834
+
835
+ epochs.write(
836
+ f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate:"
837
+ f" {train_metric['learning_rate']})"
838
+ )
839
+
840
+ train_metrics = []
841
+
842
+ if cur_step % training_args.eval_steps == 0 and cur_step > 0:
843
+ # ======================== Evaluating ==============================
844
+ num_eval_samples = len(tokenized_datasets["validation"])
845
+ # Avoid using jax.numpy here in case of TPU training
846
+ eval_samples_idx = np.arange(num_eval_samples)
847
+ eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size, drop_last=False)
848
+
849
+ eval_metrics = []
850
+ for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
851
+ samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
852
+ model_inputs = data_collator(samples, pad_to_multiple_of=16)
853
+
854
+ # Model forward
855
+ metrics = pad_shard_unpad(p_eval_step, static_return=True)(
856
+ state.params, model_inputs.data, min_device_batch=per_device_eval_batch_size
857
+ )
858
+ eval_metrics.append(metrics)
859
+
860
+ # normalize eval metrics
861
+ eval_metrics = get_metrics(eval_metrics)
862
+ eval_metrics = jax.tree_util.tree_map(jnp.sum, eval_metrics)
863
+ eval_normalizer = eval_metrics.pop("normalizer")
864
+ eval_metrics = jax.tree_util.tree_map(lambda x: x / eval_normalizer, eval_metrics)
865
+
866
+ # Update progress bar
867
+ epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
868
+
869
+ # Save metrics
870
+ if has_tensorboard and jax.process_index() == 0:
871
+ write_eval_metric(summary_writer, eval_metrics, cur_step)
872
+
873
+ if cur_step % training_args.save_steps == 0 and cur_step > 0:
874
+ # save checkpoint after each epoch and push checkpoint to the hub
875
+ if jax.process_index() == 0:
876
+ params = jax.device_get(jax.tree_util.tree_map(lambda x: x[0], state.params))
877
+ model.save_pretrained(training_args.output_dir, params=params)
878
+ tokenizer.save_pretrained(training_args.output_dir)
879
+ if training_args.push_to_hub:
880
+ api.upload_folder(
881
+ commit_message=f"Saving weights and logs of step {cur_step}",
882
+ folder_path=training_args.output_dir,
883
+ repo_id=repo_id,
884
+ repo_type="model",
885
+ token=training_args.hub_token,
886
+ )
887
+ # Eval after training
888
+ if training_args.do_eval:
889
+ num_eval_samples = len(tokenized_datasets["validation"])
890
+ # Avoid using jax.numpy here in case of TPU training
891
+ eval_samples_idx = np.arange(num_eval_samples)
892
+ eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size, drop_last=False)
893
+
894
+ eval_metrics = []
895
+ for _, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
896
+ samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
897
+ model_inputs = data_collator(samples, pad_to_multiple_of=16)
898
+
899
+ # Model forward
900
+ metrics = pad_shard_unpad(p_eval_step, static_return=True)(
901
+ state.params, model_inputs.data, min_device_batch=per_device_eval_batch_size
902
+ )
903
+ eval_metrics.append(metrics)
904
+
905
+ # normalize eval metrics
906
+ eval_metrics = get_metrics(eval_metrics)
907
+ eval_metrics = jax.tree_util.tree_map(lambda metric: jnp.sum(metric).item(), eval_metrics)
908
+ eval_normalizer = eval_metrics.pop("normalizer")
909
+ eval_metrics = jax.tree_util.tree_map(lambda x: x / eval_normalizer, eval_metrics)
910
+
911
+ try:
912
+ perplexity = math.exp(eval_metrics["loss"])
913
+ except OverflowError:
914
+ perplexity = float("inf")
915
+ eval_metrics["perplexity"] = perplexity
916
+
917
+ if jax.process_index() == 0:
918
+ eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
919
+ path = os.path.join(training_args.output_dir, "eval_results.json")
920
+ with open(path, "w") as f:
921
+ json.dump(eval_metrics, f, indent=4, sort_keys=True)
922
+
923
+
924
+ if __name__ == "__main__":
925
+ main()
run_train.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Launch RoBERTa masked-LM training with run_mlm_flax.py on the OSCAR
# "unshuffled_deduplicated_no" configuration. Config, tokenizer and outputs
# all live in the current directory; checkpoints are pushed to the Hub.
python run_mlm_flax.py \
    --output_dir=./ \
    --model_type=roberta \
    --hub_model_id=norwegian-roberta_base \
    --config_name=./ \
    --tokenizer_name=./ \
    --dataset_name=oscar \
    --dataset_config_name=unshuffled_deduplicated_no \
    --max_seq_length=128 \
    --weight_decay=0.01 \
    --per_device_train_batch_size=2 \
    --per_device_eval_batch_size=2 \
    --learning_rate=3e-4 \
    --warmup_steps=1000 \
    --overwrite_output_dir \
    --num_train_epochs=18 \
    --adam_beta1=0.9 \
    --adam_beta2=0.98 \
    --logging_steps=500 \
    --save_steps=2500 \
    --eval_steps=2500 \
    --push_to_hub
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
train_tokenizer.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
3
+
4
# Training corpus: the "train" split of OSCAR, config "unshuffled_deduplicated_no".
dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")

# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Special tokens, passed to the trainer in this order.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]


def batch_iterator(batch_size=1000):
    """Yield the "text" column of ``dataset`` in consecutive slices.

    Args:
        batch_size: number of rows per yielded slice.

    Yields:
        The ``"text"`` entries of rows ``[start, start + batch_size)``.
    """
    for start in range(0, len(dataset), batch_size):
        stop = start + batch_size
        rows = dataset[start:stop]
        yield rows["text"]


# Train the tokenizer on the streamed text batches.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Persist the trained tokenizer next to the model files.
tokenizer.save("./tokenizer.json")
vocab.json ADDED
The diff for this file is too large to render. See raw diff