#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
This copy of the script is reduced to an evaluation helper: `get_score` computes test-set
perplexity for a model fine-tuned on the babyLM data.

Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.

import logging
import math
import os
from itertools import chain

# Silence TensorFlow's C++ log output (if TF is installed) before transformers is imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import evaluate
import transformers
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.testing_utils import CaptureLogger

logger = logging.getLogger(__name__)


def get_score(submission_folder="../env"):
    training_args = TrainingArguments("test_trainer")
    training_args.report_to = []
    raw_datasets = load_dataset(submission_folder + "/babyLM_for_hf.py", "babyLM-10M", split="test")

    model = AutoModelForCausalLM.from_pretrained(submission_folder + "/output/")
    tokenizer = AutoTokenizer.from_pretrained(submission_folder + "/output/")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = list(raw_datasets.features)
    text_column_name = "text" if "text" in column_names else column_names[0]

    # Since tokenize_function will be pickled by the Hasher, force the logger to load here
    # to avoid a _LazyModule error.
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # CLM inputs can be much longer than block_size; they are chunked later in group_texts.
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output
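    # Illustrative example (not executed): with a GPT-2 style tokenizer,
    #     tokenize_function({"text": ["Hello world"]})
    # would return something like
    #     {"input_ids": [[15496, 995]], "attention_mask": [[1, 1]]}
    # where the exact ids depend on the tokenizer saved under output/.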
    with training_args.main_process_first(desc="dataset map tokenization"):
        # if not data_args.streaming:
        #     tokenized_datasets = raw_datasets.map(
        #         tokenize_function,
        #         batched=True,
        #         num_proc=data_args.preprocessing_num_workers,
        #         remove_columns=column_names,
        #         load_from_cache_file=not data_args.overwrite_cache,
        #         desc="Running tokenizer on dataset",
        #     )
        # else:
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=column_names,
        )

    # The upstream example lets the user pass --block_size via data_args; this evaluation
    # helper has no CLI arguments, so block_size always falls back to the tokenizer's
    # model_max_length, capped at 1024.
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warning(
            "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
            " of 1024. Capping block_size at 1024."
        )
        block_size = 1024
    # With a user-supplied block_size, the upstream example would instead do:
    #     if data_args.block_size > tokenizer.model_max_length:
    #         logger.warning(
    #             f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
    #             f" ({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
    #         )
    #     block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; if total_length < block_size we exclude this batch and return an empty dict.
        # We could add padding instead of this drop if the model supported it; you can customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of block_size.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
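    # Illustrative example (not executed): with block_size = 4, a tokenized batch
    #     {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8, 9]]}
    # is concatenated to [1, 2, 3, 4, 5, 6, 7, 8, 9] (length 9), truncated to length 8,
    # and regrouped into
    #     {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]], "labels": [[1, 2, 3, 4], [5, 6, 7, 8]]}
    # so the trailing token 9 is dropped; "labels" is a plain copy because the causal shift
    # is applied later (inside the model's loss and in compute_metrics below).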
    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here, but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we could use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    with training_args.main_process_first(desc="grouping texts together"):
        # if not data_args.streaming:
        #     lm_datasets = tokenized_datasets.map(
        #         group_texts,
        #         batched=True,
        #         num_proc=data_args.preprocessing_num_workers,
        #         load_from_cache_file=not data_args.overwrite_cache,
        #         desc=f"Grouping texts in chunks of {block_size}",
        #     )
        # else:
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
        )

    eval_dataset = lm_datasets

    def preprocess_logits_for_metrics(logits, labels):
        if isinstance(logits, tuple):
            # Depending on the model and config, logits may contain extra tensors,
            # like past_key_values, but logits always come first.
            logits = logits[0]
        return logits.argmax(dim=-1)

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        # preds have the same shape as the labels, after the argmax(-1) has been calculated
        # by preprocess_logits_for_metrics, but we need to shift the labels.
        labels = labels[:, 1:].reshape(-1)
        preds = preds[:, :-1].reshape(-1)
        return metric.compute(predictions=preds, references=labels)

    # Initialize our Trainer (evaluation only, so no train_dataset).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING)

    # Evaluation
    metrics = trainer.evaluate()

    metrics["eval_samples"] = len(eval_dataset)
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    metrics["perplexity"] = perplexity

    return perplexity


if __name__ == "__main__":
    print(get_score())
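
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; the folder name and module name below are
# assumptions, not part of the original harness). get_score() expects
# `submission_folder` to contain:
#
#     my_submission/
#         babyLM_for_hf.py   # dataset loading script passed to load_dataset()
#         output/            # fine-tuned model + tokenizer (saved with save_pretrained)
#
# It can then be called from another script, e.g.:
#
#     from eval import get_score          # module name is an assumption
#     perplexity = get_score(submission_folder="my_submission")
#     print(f"test-set perplexity: {perplexity:.2f}")
# ---------------------------------------------------------------------------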