""" |
|
Evaluates a causal language model (GPT, GPT-2, CTRL, ...) that was fine-tuned on the babyLM
data: loads the saved checkpoint and tokenizer, tokenizes and chunks the test split, and
reports token-level accuracy and perplexity.

Here is the full list of causal-LM checkpoints on the hub that can be used this way:

https://huggingface.co/models?filter=text-generation
"""
|
|
|
|
|
import logging |
|
import math |
|
import os |
|
|
|
# Silence TensorFlow's C++ log output in case transformers pulls in TensorFlow.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
from itertools import chain
|
|
|
import evaluate
from datasets import load_dataset
|
|
|
import transformers |
|
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.testing_utils import CaptureLogger
|
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
def get_score(submission_folder="../env"):
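    """Evaluate the fine-tuned causal LM saved in `<submission_folder>/output/` on the
    babyLM-10M test split and return its perplexity."""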
|
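    # Default TrainingArguments (output dir "test_trainer"); drop all experiment-tracking
    # integrations since this run only evaluates.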
training_args = TrainingArguments("test_trainer") |
|
training_args.report_to = [] |
|
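    # Load the babyLM-10M test split via the local dataset script, then the submitted
    # fine-tuned model and its tokenizer.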
raw_datasets = load_dataset(submission_folder + "/babyLM_for_hf.py", "babyLM-10M", split="test") |
|
model = AutoModelForCausalLM.from_pretrained(submission_folder + "/output/") |
|
tokenizer = AutoTokenizer.from_pretrained(submission_folder + "/output/") |
|
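    # Use the "text" column if present, otherwise fall back to the dataset's first column.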
column_names = list(raw_datasets.features) |
|
text_column_name = "text" if "text" in column_names else column_names[0] |
|
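    # Logger used to capture tokenizer warnings about overlong sequences (handled below).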
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") |
|
|
|
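    # Tokenize a batch of raw examples; long sequences are fine because they are chunked
    # into `block_size` pieces later.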
def tokenize_function(examples): |
|
with CaptureLogger(tok_logger) as cl: |
|
output = tokenizer(examples[text_column_name]) |
|
|
|
if "Token indices sequence length is longer than the" in cl.out: |
|
tok_logger.warning( |
|
"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" |
|
" before being passed to the model." |
|
) |
|
return output |
|
|
|
    # Tokenize the evaluation text on the main process first so other processes reuse the cache.
    with training_args.main_process_first(desc="dataset map tokenization"):
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=column_names,
        )
|
|
|
    # Chunk length for grouping the tokenized text: use the tokenizer's maximum, capped at
    # 1024 tokens.
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warning(
            "The tokenizer supports a `model_max_length` longer than 1024; capping the"
            " evaluation `block_size` at 1024."
        )
        block_size = 1024
|
    # Concatenate all tokenized texts and split them into chunks of `block_size`.
    def group_texts(examples):
        # Concatenate every column (input_ids, attention_mask, ...) into one long list.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the small remainder at the end so every chunk has exactly `block_size` tokens.
        total_length = (total_length // block_size) * block_size
        # Split each column into block_size-sized chunks.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        # For causal LM, the labels are the input ids themselves (shifting happens in the model).
        result["labels"] = result["input_ids"].copy()
        return result
|
|
    # Group the tokenized examples into fixed-length blocks; the result is the evaluation set.
    with training_args.main_process_first(desc="grouping texts together"):
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
        )
    eval_dataset = lm_datasets
|
|
|
    def preprocess_logits_for_metrics(logits, labels):
        if isinstance(logits, tuple):
            # Some models return extra tensors (e.g. past_key_values); the logits come first.
            logits = logits[0]
        # Return predicted token ids so compute_metrics works on ids rather than full logits.
        return logits.argmax(dim=-1)
|
|
|
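    # Token-level accuracy, computed from the shifted predictions and labels below.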
metric = evaluate.load("accuracy") |
|
|
|
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # preds have the same shape as labels after preprocess_logits_for_metrics; shift them
        # so that tokens < n predict n, then flatten for the accuracy metric.
        labels = labels[:, 1:].reshape(-1)
        preds = preds[:, :-1].reshape(-1)
        return metric.compute(predictions=preds, references=labels)
|
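    # Evaluation-only Trainer: no train_dataset, default collator since every example is
    # already a fixed-length block.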
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )
|
|
|
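    # Reduce transformers logging noise during evaluation.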
transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING) |
|
    metrics = trainer.evaluate()

    metrics["eval_samples"] = len(eval_dataset)
    # Perplexity is exp(eval loss); guard against overflow for very large losses.
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    metrics["perplexity"] = perplexity

    return perplexity
|
|
|
if __name__ == "__main__": |
|
print(get_score()) |