Spaces:

Tort73
/

new-test-autoresearch

Paused

new-test-autoresearch / benchmarks /babylm /scripts /eval.py

Teerth Patel

initial commit

199a42f 3 months ago

8.57 kB

	#!/usr/bin/env python
	# coding=utf-8
	# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""
	Evaluate a causal language model (GPT, GPT-2, CTRL, ...) on the BabyLM test split and report its perplexity.

	This script is adapted from the Hugging Face causal language modeling fine-tuning example. Here is the full list
	of checkpoints on the hub that can be fine-tuned by that example:
	https://huggingface.co/models?filter=text-generation
	"""
	# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.

	import logging
	import math
	import os
	# Silence TensorFlow's C++ log output (this script only evaluates; there is no training phase).
	os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
	import sys
	from dataclasses import dataclass, field
	from itertools import chain
	from typing import Optional

	import datasets
	import evaluate
	import torch
	from datasets import load_dataset

	import transformers
	from transformers import (
	CONFIG_MAPPING,
	MODEL_FOR_CAUSAL_LM_MAPPING,
	AutoConfig,
	AutoModelForCausalLM,
	AutoTokenizer,
	HfArgumentParser,
	Trainer,
	TrainingArguments,
	default_data_collator,
	is_torch_tpu_available,
	set_seed,
	)
	from transformers.testing_utils import CaptureLogger
	from transformers.trainer_utils import get_last_checkpoint
	from transformers.utils import check_min_version, send_example_telemetry
	from transformers.utils.versions import require_version

	from transformers import AutoModel, AutoTokenizer
	from datasets import load_dataset
	from transformers.testing_utils import CaptureLogger

	from itertools import chain

	logger = logging.getLogger(__name__)


	def get_score(submission_folder: str = "../env") -> float:
	    """Evaluate the submitted causal LM on the BabyLM test split and return its perplexity.

	    The function loads the dataset script ``babyLM_for_hf.py`` (config ``"babyLM-10M"``,
	    split ``"test"``) and the model/tokenizer stored under ``output`` inside
	    ``submission_folder``. The text is tokenized, concatenated and cut into fixed-size
	    blocks, then scored with ``Trainer.evaluate``.

	    Args:
	        submission_folder: Directory containing ``babyLM_for_hf.py`` and an ``output``
	            sub-directory with the saved model and tokenizer.

	    Returns:
	        ``exp(eval_loss)`` as reported by the trainer, or ``float("inf")`` when the
	        exponential overflows.
	    """
	    # "test_trainer" is the Trainer output directory; reporting integrations are disabled.
	    training_args = TrainingArguments("test_trainer")
	    training_args.report_to = []

	    model_dir = os.path.join(submission_folder, "output")
	    raw_datasets = load_dataset(
	        os.path.join(submission_folder, "babyLM_for_hf.py"), "babyLM-10M", split="test"
	    )
	    model = AutoModelForCausalLM.from_pretrained(model_dir)
	    tokenizer = AutoTokenizer.from_pretrained(model_dir)

	    # Preprocessing the datasets.
	    # First we tokenize all the texts.
	    column_names = list(raw_datasets.features)
	    text_column_name = "text" if "text" in column_names else column_names[0]

	    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
	    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

	    def tokenize_function(examples):
	        """Tokenize one batch of raw text, explaining the tokenizer's length warning if it fires."""
	        with CaptureLogger(tok_logger) as cl:
	            output = tokenizer(examples[text_column_name])
	        # clm input could be much much longer than block_size
	        if "Token indices sequence length is longer than the" in cl.out:
	            tok_logger.warning(
	                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
	                " before being passed to the model."
	            )
	        return output

	    with training_args.main_process_first(desc="dataset map tokenization"):
	        tokenized_datasets = raw_datasets.map(
	            tokenize_function,
	            batched=True,
	            remove_columns=column_names,
	        )

	    # Use the tokenizer's maximum length as the block size, capped at 1024 tokens.
	    block_size = tokenizer.model_max_length
	    if block_size > 1024:
	        logger.warning(
	            "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
	            " of 1024. Evaluation will use `block_size=1024`."
	        )
	        block_size = 1024

	    def group_texts(examples):
	        """Concatenate every column of the batch and split it into chunks of ``block_size``."""
	        concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
	        total_length = len(concatenated_examples[list(examples.keys())[0]])
	        # We drop the small remainder; if total_length < block_size the batch yields no chunks at all.
	        total_length = (total_length // block_size) * block_size
	        # Split by chunks of block_size.
	        result = {
	            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
	            for k, t in concatenated_examples.items()
	        }
	        # For causal LM the labels are a copy of the inputs.
	        result["labels"] = result["input_ids"].copy()
	        return result

	    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
	    # for each of those groups of 1,000 texts.
	    with training_args.main_process_first(desc="grouping texts together"):
	        eval_dataset = tokenized_datasets.map(
	            group_texts,
	            batched=True,
	        )

	    def preprocess_logits_for_metrics(logits, labels):
	        """Reduce logits to predicted token ids; ``labels`` is accepted but not used here."""
	        if isinstance(logits, tuple):
	            # Depending on the model and config, logits may contain extra tensors,
	            # like past_key_values, but logits always come first
	            logits = logits[0]
	        return logits.argmax(dim=-1)

	    metric = evaluate.load("accuracy")

	    def compute_metrics(eval_preds):
	        """Compute accuracy between predictions and labels, offset by one position."""
	        preds, labels = eval_preds
	        # preds have the same shape as the labels, after the argmax(-1) has been calculated
	        # by preprocess_logits_for_metrics but we need to shift the labels
	        labels = labels[:, 1:].reshape(-1)
	        preds = preds[:, :-1].reshape(-1)
	        return metric.compute(predictions=preds, references=labels)

	    # Initialize our Trainer
	    trainer = Trainer(
	        model=model,
	        args=training_args,
	        train_dataset=None,
	        eval_dataset=eval_dataset,
	        tokenizer=tokenizer,
	        # Data collator will default to DataCollatorWithPadding, so we change it.
	        data_collator=default_data_collator,
	        compute_metrics=compute_metrics,
	        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
	    )

	    transformers.utils.logging.set_verbosity(transformers.utils.logging.WARNING)

	    # Evaluation
	    metrics = trainer.evaluate()

	    # math.exp raises OverflowError for very large losses; report that as infinite perplexity.
	    try:
	        perplexity = math.exp(metrics["eval_loss"])
	    except OverflowError:
	        perplexity = float("inf")

	    return perplexity

	if __name__ == "__main__":
	    # Script entry point: score the default submission folder and print the perplexity.
	    score = get_score()
	    print(score)