Teerth Patel
initial commit
199a42f
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
import logging
import math
import os
# disable logging until training starts
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import datasets
import evaluate
import torch
from datasets import load_dataset
import transformers
from transformers import (
CONFIG_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
HfArgumentParser,
Trainer,
TrainingArguments,
default_data_collator,
is_torch_tpu_available,
set_seed,
)
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from transformers.testing_utils import CaptureLogger
from itertools import chain
logger = logging.getLogger(__name__)
def get_score(submission_folder = "../env"):
training_args = TrainingArguments("test_trainer")
training_args.report_to = []
raw_datasets = load_dataset(submission_folder + "/babyLM_for_hf.py", "babyLM-10M", split="test")
model = AutoModelForCausalLM.from_pretrained(submission_folder + "/output/")
tokenizer = AutoTokenizer.from_pretrained(submission_folder + "/output/")
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = list(raw_datasets.features)
text_column_name = "text" if "text" in column_names else column_names[0]
# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
def tokenize_function(examples):
with CaptureLogger(tok_logger) as cl:
output = tokenizer(examples[text_column_name])
# clm input could be much much longer than block_size
if "Token indices sequence length is longer than the" in cl.out:
tok_logger.warning(
"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
" before being passed to the model."
)
return output
with training_args.main_process_first(desc="dataset map tokenization"):
# if not data_args.streaming:
# tokenized_datasets = raw_datasets.map(
# tokenize_function,
# batched=True,
# num_proc=data_args.preprocessing_num_workers,
# remove_columns=column_names,
# load_from_cache_file=not data_args.overwrite_cache,
# desc="Running tokenizer on dataset",
# )
# else:
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
remove_columns=column_names,
)
if True:
block_size = tokenizer.model_max_length
if block_size > 1024:
logger.warning(
"The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
" of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
" override this default with `--block_size xxx`."
)
block_size = 1024
else:
if data_args.block_size > tokenizer.model_max_length:
logger.warning(
f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
)
block_size = min(data_args.block_size, tokenizer.model_max_length)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
total_length = (total_length // block_size) * block_size
# Split by chunks of max_len.
result = {
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
for k, t in concatenated_examples.items()
}
result["labels"] = result["input_ids"].copy()
return result
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
with training_args.main_process_first(desc="grouping texts together"):
# if not data_args.streaming:
# lm_datasets = tokenized_datasets.map(
# group_texts,
# batched=True,
# num_proc=data_args.preprocessing_num_workers,
# load_from_cache_file=not data_args.overwrite_cache,
# desc=f"Grouping texts in chunks of {block_size}",
# )
# else:
lm_datasets = tokenized_datasets.map(
group_texts,
batched=True,
)
eval_dataset = lm_datasets
def preprocess_logits_for_metrics(logits, labels):
if isinstance(logits, tuple):
# Depending on the model and config, logits may contain extra tensors,
# like past_key_values, but logits always come first
logits = logits[0]
return logits.argmax(dim=-1)
metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
preds, labels = eval_preds
# preds have the same shape as the labels, after the argmax(-1) has been calculated
# by preprocess_logits_for_metrics but we need to shift the labels
labels = labels[:, 1:].reshape(-1)
preds = preds[:, :-1].reshape(-1)
return metric.compute(predictions=preds, references=labels)
# Initialize our Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=None,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
# Data collator will default to DataCollatorWithPadding, so we change it.
data_collator=default_data_collator,
compute_metrics=compute_metrics,
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING)
# Evaluation
metrics = trainer.evaluate()
max_eval_samples = len(eval_dataset)
metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
try:
perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
perplexity = float("inf")
metrics["perplexity"] = perplexity
return perplexity
if __name__ == "__main__":
print(get_score())