# reconninja-wordlists-v2 / training_and_evaluation.py
# Uploaded by Canstralian; commit 55d61a2 (verified): "Create training_and_evaluation.py"
# File size: 3.05 kB
import os

import numpy as np
import torch
from datasets import load_dataset, load_metric
from dotenv import load_dotenv
from transformers import AutoAdapterModel, AutoTokenizer, TrainingArguments, Trainer
# Read the local .env file first so the lookups below can see its entries.
load_dotenv()
# Each key is None when the corresponding variable is not set.
# NOTE(review): none of these names is referenced again in the visible script.
# NOTE(review): Weights & Biases normally reads WANDB_API_KEY; confirm that
# "WAND_API_KEY" is the intended spelling.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
WAND_API_KEY = os.environ.get("WAND_API_KEY")
# Hub identifiers for the two corpora and for the tokenizer checkpoint.
_PENTESTING_DATASET_ID = "canstralian/pentesting-ai"
_REDPAJAMA_DATASET_ID = "togethercomputer/RedPajama-Data-1T"
_TOKENIZER_CHECKPOINT = "canstralian/rabbitredeux"

# NOTE(review): the RedPajama corpus is only tokenized later on; it is never
# passed to the Trainer in the visible script.
dataset_pentesting = load_dataset(_PENTESTING_DATASET_ID)
dataset_redpajama = load_dataset(_REDPAJAMA_DATASET_ID)

# Tokenizer used by tokenize_function for both training and inference text.
tokenizer = AutoTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT)
def tokenize_function(examples):
    """Tokenize the ``text`` entry of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry holding the strings to
            encode (a list of strings when called through
            ``map(..., batched=True)``).

    Returns:
        The tokenizer's output for those strings, requested with
        ``padding="max_length"`` and ``truncation=True``.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)
# Apply tokenize_function to both corpora, handing it batches of examples.
_MAP_OPTIONS = {"batched": True}
tokenized_dataset_pentesting = dataset_pentesting.map(
    tokenize_function, **_MAP_OPTIONS
)
# NOTE(review): this result is not used again in the visible script.
tokenized_dataset_redpajama = dataset_redpajama.map(
    tokenize_function, **_MAP_OPTIONS
)

# Pick the splits handed to the Trainer.
_pentesting_splits = tokenized_dataset_pentesting
train_dataset_pentesting = _pentesting_splits["train"]
validation_dataset_pentesting = _pentesting_splits["validation"]
# Base checkpoint, with the adapter loaded on top and activated immediately.
_MODEL_CHECKPOINT = "canstralian/rabbitredeux"
_ADAPTER_ID = "Canstralian/RabbitRedux"
model = AutoAdapterModel.from_pretrained(_MODEL_CHECKPOINT)
model.load_adapter(_ADAPTER_ID, set_active=True)

# Accuracy metric shared by the Trainer's compute_metrics hook and the
# single-example check at the end of the script.
# NOTE(review): load_metric is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package; confirm the pinned version.
metric = load_metric("accuracy")
# Hyperparameters and output locations for the fine-tuning run.
_TRAINING_CONFIG = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "evaluation_strategy": "epoch",
}
training_args = TrainingArguments(**_TRAINING_CONFIG)


def _compute_accuracy(eval_prediction):
    """Return the accuracy metric for one evaluation pass.

    The predicted class is the arg-max of ``eval_prediction.predictions``
    along axis 1; it is compared against ``eval_prediction.label_ids``.
    """
    predicted_classes = np.argmax(eval_prediction.predictions, axis=1)
    return metric.compute(
        predictions=predicted_classes,
        references=eval_prediction.label_ids,
    )


# Trainer wired to the pentesting train/validation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_pentesting,
    eval_dataset=validation_dataset_pentesting,
    compute_metrics=_compute_accuracy,
)
# Run fine-tuning with the Trainer configured above.
trainer.train()

# Evaluate on the Trainer's eval dataset and show the resulting metrics.
eval_results = trainer.evaluate()
print("Evaluation Results: ", eval_results)

# Write the fine-tuned model to ./fine_tuned_model.
_FINE_TUNED_DIR = "./fine_tuned_model"
model.save_pretrained(_FINE_TUNED_DIR)
# Test model on new data
new_data = """
I love the ocean. It is so peaceful and serene.
"""
# Tokenize new data with the same settings used for the training corpus.
tokenized_new_data = tokenize_function({"text": [new_data]})
input_ids = tokenized_new_data["input_ids"][0]
attention_mask = tokenized_new_data["attention_mask"][0]

# Prediction
# The model is a PyTorch module: it needs torch tensors (NumPy arrays are
# rejected) placed on whatever device the Trainer left the model on.
_device = model.device
model.eval()  # make sure dropout and similar layers are in inference mode
with torch.no_grad():  # no autograd graph is needed for a single prediction
    outputs = model(
        input_ids=torch.tensor([input_ids], device=_device),
        attention_mask=torch.tensor([attention_mask], device=_device),
    )
prediction_scores = outputs.logits[0]  # Getting logits for the first sample

# Get predicted label
# torch.argmax without `dim` works on the flattened tensor, like np.argmax,
# and .item() yields a plain Python int regardless of the tensor's device.
predicted_label = torch.argmax(prediction_scores).item()
print(f"The predicted label is: {predicted_label}")

# Evaluate predictions (using some assumed correct label)
actual_label = 1  # Replace with the actual label if known
accuracy = metric.compute(predictions=[predicted_label], references=[actual_label])
print(f"Accuracy on new data: {accuracy}")