OMG-LLaVA / xtuner /tools /process_untokenized_llava_data.py
zhangtao-whu's picture
Upload folder using huggingface_hub
476ac07 verified
raw
history blame
841 Bytes
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import warnings
from mmengine import Config
from xtuner.registry import BUILDER
# Silence every FutureWarning for the whole process (the hf datasets
# library is the noisy source this was added for).
warnings.simplefilter('ignore', FutureWarning)
def parse_args():
    """Parse the command-line arguments of this script.

    Reads ``sys.argv``. argparse exits with a usage error when a required
    argument is missing.

    Returns:
        argparse.Namespace: Holds ``config`` (config file name or path) and
        ``save_folder`` (the directory given via ``--save-folder``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='config file name or path.')
    # The output folder is mandatory: the script hands it straight to
    # ``save_to_disk``, so reject a missing value here instead of failing
    # only after the dataset has been built.
    parser.add_argument(
        '--save-folder',
        required=True,
        help='The folder to save the text data.')
    return parser.parse_args()
def build_llava_dataset(config):
    """Build the training dataset described by ``config``.

    Args:
        config: Loaded config object exposing ``train_dataloader.dataset``.

    Returns:
        Whatever ``BUILDER.build`` produces for that dataset entry.
    """
    dataset_cfg = config.train_dataloader.dataset
    return BUILDER.build(dataset_cfg)
if __name__ == '__main__':
    cli_args = parse_args()
    # Load the config file, then build the dataset it describes.
    dataset = build_llava_dataset(Config.fromfile(cli_args.config))
    # Write the dataset's ``text_data`` to the requested folder.
    # NOTE(review): ``text_data`` is presumably a Hugging Face ``Dataset``
    # (it exposes ``save_to_disk``) -- confirm against the dataset class.
    dataset.text_data.save_to_disk(cli_args.save_folder)