#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.

Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
import logging
import math
import os

# Suppress TensorFlow's C++ log output before any TF-backed modules are imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

import datasets
import evaluate
import torch
from datasets import load_dataset

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    is_torch_tpu_available,
    set_seed,
)
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)


def get_score(submission_folder="../env"):
    training_args = TrainingArguments("test_trainer")
    training_args.report_to = []
    raw_datasets = load_dataset(submission_folder + "/babyLM_for_hf.py", "babyLM-10M", split="test")
    model = AutoModelForCausalLM.from_pretrained(submission_folder + "/output/")
    tokenizer = AutoTokenizer.from_pretrained(submission_folder + "/output/")
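    # NOTE: submission_folder + "/output/" is assumed to hold a checkpoint written
    # by `save_pretrained` during fine-tuning (config, model weights, and tokenizer
    # files); `from_pretrained` loads both the model and the tokenizer from it.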
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = list(raw_datasets.features)
    text_column_name = "text" if "text" in column_names else column_names[0]

    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output
    with training_args.main_process_first(desc="dataset map tokenization"):
        # if not data_args.streaming:
        #     tokenized_datasets = raw_datasets.map(
        #         tokenize_function,
        #         batched=True,
        #         num_proc=data_args.preprocessing_num_workers,
        #         remove_columns=column_names,
        #         load_from_cache_file=not data_args.overwrite_cache,
        #         desc="Running tokenizer on dataset",
        #     )
        # else:
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=column_names,
        )
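    # The commented-out branch above appears to be carried over from the upstream
    # `run_clm.py` example, where streaming, num_proc, and cache behaviour are
    # configurable via data_args; this scoring script always uses the plain map call.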
    # `data_args` from the original training script is not available here, so we
    # always use the tokenizer's maximum length, capped at the default of 1024.
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warning(
            "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
            " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
            " override this default with `--block_size xxx`."
        )
        block_size = 1024
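    # For a GPT-2-style tokenizer, `model_max_length` is 1024, so `block_size`
    # typically ends up as 1024 without triggering the warning above.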
    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
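    # Illustrative example (not executed): with block_size = 4, a batch
    # {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8]]} is concatenated to
    # [1, 2, 3, 4, 5, 6, 7, 8] and regrouped into
    # {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]], "labels": [[1, 2, 3, 4], [5, 6, 7, 8]]};
    # a trailing remainder shorter than block_size would be dropped.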
    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    with training_args.main_process_first(desc="grouping texts together"):
        # if not data_args.streaming:
        #     lm_datasets = tokenized_datasets.map(
        #         group_texts,
        #         batched=True,
        #         num_proc=data_args.preprocessing_num_workers,
        #         load_from_cache_file=not data_args.overwrite_cache,
        #         desc=f"Grouping texts in chunks of {block_size}",
        #     )
        # else:
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
        )

    eval_dataset = lm_datasets
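    # The entire grouped test split is used as the evaluation set; no training
    # split is needed because this script only scores an already fine-tuned model.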
    def preprocess_logits_for_metrics(logits, labels):
        if isinstance(logits, tuple):
            # Depending on the model and config, logits may contain extra tensors,
            # like past_key_values, but logits always come first
            logits = logits[0]
        return logits.argmax(dim=-1)
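    # Reducing the logits to argmax token ids above keeps the Trainer from
    # accumulating full vocabulary-sized logit tensors over the whole evaluation
    # set, which would otherwise use a large amount of memory.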
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # preds have the same shape as the labels, after the argmax(-1) has been calculated
        # by preprocess_logits_for_metrics but we need to shift the labels
        labels = labels[:, 1:].reshape(-1)
        preds = preds[:, :-1].reshape(-1)
        return metric.compute(predictions=preds, references=labels)
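    # The shift above aligns each prediction with the token it was asked to
    # predict: the model's output at position i is a guess for the token at
    # position i + 1, so preds[:, :-1] is compared against labels[:, 1:].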
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING)
    # Evaluation
    metrics = trainer.evaluate()
    metrics["eval_samples"] = len(eval_dataset)
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    metrics["perplexity"] = perplexity

    return perplexity


if __name__ == "__main__":
    print(get_score())
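# Running this file directly prints the evaluation perplexity of the fine-tuned
# model, assuming ../env/ contains babyLM_for_hf.py and a model checkpoint saved
# under output/ (the defaults of get_score above).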