File size: 3,048 Bytes
55d61a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
from datasets import load_dataset, load_metric
import numpy as np
from transformers import AutoAdapterModel, AutoTokenizer, TrainingArguments, Trainer
from dotenv import load_dotenv

# Populate the process environment from a local .env file before the
# lookups below run.
load_dotenv()

# Read each credential by name; os.getenv yields None when a name is unset.
# NOTE(review): "WAND_API_KEY" may be meant as Weights & Biases'
# "WANDB_API_KEY" - confirm against the .env file before relying on it.
GOOGLE_API_KEY, HF_TOKEN, OPENAI_API_KEY, WAND_API_KEY = map(
    os.getenv,
    ("GOOGLE_API_KEY", "HF_TOKEN", "OPENAI_API_KEY", "WAND_API_KEY"),
)

# The constants above are available to the rest of the script as needed.


# Load datasets
dataset_pentesting = load_dataset("canstralian/pentesting-ai")
# RedPajama-Data-1T is a multi-terabyte corpus. A plain load_dataset() call
# tries to download and prepare all of it before the script can continue, so
# it is opened in streaming mode instead: records are fetched lazily, only
# when something iterates over them, and a later .map() on the result is
# lazy as well.
dataset_redpajama = load_dataset(
    "togethercomputer/RedPajama-Data-1T",
    streaming=True,
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("canstralian/rabbitredeux")

def tokenize_function(examples):
    """Encode the ``text`` entry of *examples* with the module-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key holding the string(s) to
            encode (a batch of rows when called via ``map(batched=True)``).

    Returns:
        The tokenizer's output for those texts, requested with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Apply tokenize_function to both corpora, handing it a batch of rows per call.
tokenized_dataset_pentesting = dataset_pentesting.map(
    function=tokenize_function,
    batched=True,
)
tokenized_dataset_redpajama = dataset_redpajama.map(
    function=tokenize_function,
    batched=True,
)

# Pick out the two pentesting splits that are handed to the Trainer later on.
# NOTE(review): assumes the dataset provides both "train" and "validation"
# splits - confirm, otherwise the lookup raises KeyError.
train_dataset_pentesting, validation_dataset_pentesting = (
    tokenized_dataset_pentesting[split] for split in ("train", "validation")
)

# Load model and adapter
# The base model uses the same identifier the tokenizer is loaded from.
model = AutoAdapterModel.from_pretrained("canstralian/rabbitredeux")
# set_active=True asks the library to activate the adapter while loading it.
# NOTE(review): the adapter id ("Canstralian/RabbitRedux") differs from the
# base model id in both capitalisation and spelling ("Redux" vs "redeux") -
# confirm that both identifiers are intended.
model.load_adapter("Canstralian/RabbitRedux", set_active=True)

# Load metric (accuracy)
# NOTE(review): newer `datasets` releases deprecate load_metric in favour of
# the separate `evaluate` package - confirm against the pinned version.
metric = load_metric("accuracy")

# Hyperparameters and output locations for the fine-tuning run, gathered in
# one mapping and expanded into TrainingArguments below.
_training_kwargs = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 10,
    # Run evaluation once per epoch rather than every N steps.
    "evaluation_strategy": "epoch",
}
training_args = TrainingArguments(**_training_kwargs)

def _accuracy_from_eval(eval_pred):
    """Score one evaluation pass with the module-level accuracy metric.

    The index of the largest value along axis 1 of ``eval_pred.predictions``
    is taken as the predicted class and compared with ``eval_pred.label_ids``.
    """
    predicted = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predicted, references=eval_pred.label_ids)


# Wire the model, hyperparameters, data splits and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_pentesting,
    eval_dataset=validation_dataset_pentesting,
    compute_metrics=_accuracy_from_eval,
)

# Run the fine-tuning loop.
trainer.train()

# Score the trained model on the evaluation split and report the metrics.
eval_results = trainer.evaluate()
print("Evaluation Results: ", eval_results)

# Persist the fine-tuned model to disk.
model.save_pretrained("./fine_tuned_model")

# Test model on new data
new_data = """
I love the ocean. It is so peaceful and serene.
"""

# Tokenize new data with the same settings used for the training data, then
# convert the resulting Python lists into PyTorch tensors: the model's
# forward pass cannot consume plain lists or NumPy arrays.
tokenized_new_data = tokenize_function({"text": [new_data]}).convert_to_tensors("pt")
# Training may have moved the model to an accelerator; the inputs must live
# on the same device as the weights.
tokenized_new_data = tokenized_new_data.to(model.device)
input_ids = tokenized_new_data["input_ids"]
attention_mask = tokenized_new_data["attention_mask"]

# Prediction
model.eval()  # make sure dropout is disabled for inference
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
prediction_scores = outputs.logits[0]  # Getting logits for the first sample

# Get predicted label as a plain Python int. Tensor.argmax() without a dim
# works on the flattened scores, matching np.argmax() without an axis.
predicted_label = int(prediction_scores.argmax().item())

print(f"The predicted label is: {predicted_label}")

# Evaluate predictions (using some assumed correct label)
actual_label = 1  # Replace with the actual label if known

accuracy = metric.compute(predictions=[predicted_label], references=[actual_label])

print(f"Accuracy on new data: {accuracy}")