import numpy as np
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer

# Load the tokenizer from a local checkpoint and reuse the EOS token for padding,
# since GPT-style tokenizers do not define a pad token by default.
tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

out_dir = "/out_dir/xed"
max_length = 1024

# XED Finnish subsets: "fi_annotated" carries multi-label emotion annotations,
# "fi_neutral" carries a single neutral label per sentence.
fi_annotated_raw = load_dataset("xed_en_fi", "fi_annotated")
fi_neutral_raw = load_dataset("xed_en_fi", "fi_neutral")


def to_arr(examples):
    """Wrap each scalar neutral label in a list so the schema matches fi_annotated."""
    labels = [[item] for item in examples["labels"]]
    return {"sentence": examples["sentence"], "labels": labels}


# Align the neutral split's features with the annotated split, then combine the
# neutral and emotion-labelled sentences into a single dataset.
fi_neutral_mapped = fi_neutral_raw["train"].map(to_arr, batched=True)
fi_neutral_mapped_cast = fi_neutral_mapped.cast(fi_annotated_raw["train"].features)
concat_raw_set = concatenate_datasets([fi_neutral_mapped_cast, fi_annotated_raw["train"]])


def tokenize_function(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


def to_arr_2(examples):
    """Convert each list of label indices into a 9-dimensional multi-hot vector."""
    labels = []
    for item in examples["labels"]:
        label = np.zeros(9)
        label[item] = 1
        labels.append(label.tolist())
    return {"sentence": examples["sentence"], "labels": labels}


# Tokenize, convert labels to multi-hot vectors, shuffle, and hold out 10% for evaluation.
tokenized_datasets = (
    concat_raw_set.map(tokenize_function, batched=True)
    .map(to_arr_2, batched=True)
    .shuffle(seed=42)
    .train_test_split(test_size=0.1)
)
tokenized_datasets.save_to_disk(out_dir)
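
# A minimal sketch of how the saved splits could be read back downstream
# (assuming a separate multi-label training script consumes them);
# `load_from_disk` is the datasets-library counterpart of `save_to_disk` used above.
#
#   from datasets import load_from_disk
#
#   splits = load_from_disk(out_dir)
#   train_set, eval_set = splits["train"], splits["test"]
#   print(train_set[0]["labels"])  # 9-dim multi-hot vector, e.g. [1.0, 0.0, ...]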