Spaces:

Justinrune
/

LLaMA-Factory

Running

App Files Files Community

LLaMA-Factory / scripts /length_cdf.py

Justinrune

Upload folder using huggingface_hub

2852136 verified 6 months ago

raw

history blame

2.25 kB

	# coding=utf-8
	# Copyright 2024 the LlamaFactory team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from collections import defaultdict

	import fire
	from tqdm import tqdm

	from llamafactory.data import get_dataset
	from llamafactory.hparams import get_train_args
	from llamafactory.model import load_tokenizer


	def length_cdf(
	    model_name_or_path: str,
	    dataset: str = "alpaca_en",
	    dataset_dir: str = "data",
	    template: str = "default",
	    interval: int = 1000,
	) -> None:
	    r"""
	    Calculates the distribution of the input lengths in the dataset.
	    Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default

	    Every sample's ``input_ids`` length is rounded down to a multiple of
	    ``interval``; the buckets are then walked in ascending order and one line
	    is printed per non-empty bucket with the cumulative sample count and
	    cumulative percentage of samples shorter than the bucket's upper bound.

	    Args:
	        model_name_or_path: Forwarded to ``get_train_args``; the resulting
	            model arguments are passed to ``load_tokenizer``.
	        dataset: Dataset name forwarded to ``get_train_args``.
	        dataset_dir: Dataset directory forwarded to ``get_train_args``.
	        template: Template name forwarded to ``get_train_args``.
	        interval: Bucket width, in tokens. Must be positive.

	    Raises:
	        ValueError: If ``interval`` is not positive.
	    """
	    # Fail fast: a zero interval would otherwise surface as ZeroDivisionError
	    # only after the tokenizer and dataset have been loaded, and a negative
	    # one would silently yield meaningless buckets.
	    if interval <= 0:
	        raise ValueError("`interval` must be positive, got {}.".format(interval))

	    model_args, data_args, training_args, _, _ = get_train_args(
	        dict(
	            stage="sft",
	            model_name_or_path=model_name_or_path,
	            dataset=dataset,
	            dataset_dir=dataset_dir,
	            template=template,
	            # NOTE(review): presumably large enough that no sample gets
	            # truncated, so true lengths are measured -- confirm.
	            cutoff_len=1_000_000,
	            output_dir="dummy_dir",
	            overwrite_cache=True,
	        )
	    )
	    tokenizer_module = load_tokenizer(model_args)
	    trainset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
	    total_num = len(trainset)

	    # Histogram keyed by the lower bound of each ``interval``-wide bucket.
	    length_dict = defaultdict(int)
	    for sample in tqdm(trainset["input_ids"]):
	        length_dict[len(sample) // interval * interval] += 1

	    count_accu = 0
	    for length, count in sorted(length_dict.items()):
	        count_accu += count
	        # Derive the percentage from the exact integer running count rather
	        # than summing per-bucket float fractions, which accumulates
	        # rounding error across buckets.
	        prob_accu = count_accu / total_num * 100
	        print("{:d} ({:.2f}%) samples have length < {}.".format(count_accu, prob_accu, length + interval))


	if __name__ == "__main__":
	    # Expose ``length_cdf`` as a command-line interface; its keyword
	    # arguments become ``--flag value`` options.
	    fire.Fire(component=length_cdf)