from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import torch

# Load the training dataset from a local CSV file
def load_train_data():
    train_dataset = load_dataset("csv", data_files={"train": "datasets/Canstralian/ShellCommands.csv"})
    return train_dataset

# Load the model and tokenizer
def load_model_and_tokenizer(model_name):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # adjust num_labels to the dataset
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Llama-style tokenizers ship without a padding token; fall back to EOS so padding works
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
    return model, tokenizer

# Tokenize a batch of examples; the CSV is expected to provide a "text" column
def preprocess_function(examples, tokenizer):
    return tokenizer(examples["text"], padding=True, truncation=True)

# Fine-tuning function
def fine_tune(model_name="WhiteRabbitNeo/WhiteRabbitNeo-13B-v1"):
    train_data = load_train_data()
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Tokenize the dataset
    train_data = train_data.map(lambda x: preprocess_function(x, tokenizer), batched=True)

    # Trainer expects the label column to be named "labels"
    if "label" in train_data["train"].column_names:
        train_data = train_data.rename_column("label", "labels")
    train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # Training arguments (evaluation is disabled because no eval_dataset is provided)
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
    )

    trainer.train()

    # Save the final model and tokenizer so they can be reloaded for inference
    trainer.save_model("./results")
    tokenizer.save_pretrained("./results")

# Call fine-tuning
if __name__ == "__main__":
    fine_tune()
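
# --- Usage sketch (not part of the original script; a minimal, assumed example) ---
# Shows how the checkpoint saved under ./results could be reloaded to classify a
# single shell command. The checkpoint path and the meaning of the two label
# indices are assumptions; adjust them to match your training run. Note that a
# 13B-parameter classifier needs substantial GPU memory even for inference.
def classify_command(text, checkpoint="./results"):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Return the predicted label index (0 or 1)
    return logits.argmax(dim=-1).item()

# Example call:
# print(classify_command("rm -rf /"))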