Create gpt
Browse files
gpt
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
|
6 |
+
|
7 |
+
class ShellcodeDataset(Dataset):
    """Map-style dataset that turns (intent, snippet) rows into causal-LM examples.

    Each row must expose the string fields ``'intent'`` and ``'snippet'``.
    An item is the single token sequence ``intent + separator + snippet + EOS``,
    truncated on the right and padded to ``max_length``. Labels mirror
    ``input_ids`` position-for-position, but the prompt and the padding are set
    to -100 (the default ``ignore_index`` of ``nn.CrossEntropyLoss``) so that
    only the snippet tokens and the closing EOS contribute to the loss.

    Args:
        data: Indexable collection of rows (supports ``len`` and ``data[idx]``).
        tokenizer: Callable returning a mapping with an ``'input_ids'`` list of
            ints for a string, and exposing ``eos_token_id`` (and optionally
            ``pad_token_id``).
        max_length: Fixed length of every returned tensor.
        separator: Text appended to the intent to mark where the snippet starts.
    """

    def __init__(self, data, tokenizer, max_length: int = 1024, separator: str = "\n"):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.separator = separator

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx) -> dict:
        """Return 1-D ``input_ids``, ``attention_mask`` and ``labels`` tensors.

        All three tensors have shape ``(max_length,)`` so the default DataLoader
        collate stacks them into ``(batch, max_length)``.
        """
        row = self.data[idx]
        eos_id = self.tokenizer.eos_token_id
        # Fall back to EOS when the tokenizer has no dedicated padding token.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = eos_id

        prompt_ids = list(self.tokenizer(row['intent'] + self.separator)['input_ids'])
        # The explicit EOS teaches the model where a snippet ends.
        target_ids = list(self.tokenizer(row['snippet'])['input_ids']) + [eos_id]

        # Right-truncate; if the prompt alone fills max_length the item carries
        # no supervised positions (all labels are -100).
        token_ids = (prompt_ids + target_ids)[:self.max_length]
        label_ids = ([-100] * len(prompt_ids) + target_ids)[:self.max_length]

        n_real = len(token_ids)
        n_pad = self.max_length - n_real

        return {
            'input_ids': torch.tensor(token_ids + [pad_id] * n_pad, dtype=torch.long),
            'attention_mask': torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long),
            'labels': torch.tensor(label_ids + [-100] * n_pad, dtype=torch.long),
        }
|
21 |
+
|
22 |
+
# Fine-tune GPT-2 with a next-token (causal LM) objective on the shellcode data.

# Initialize tokenizer and model. The LM-head variant is required: the bare
# AutoModel backbone returns hidden states only and has no `.logits`.
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token to the tokenizer (reuses EOS as the pad token).
tokenizer.pad_token = tokenizer.eos_token

# Train on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the dataset
dataset = load_dataset('SoLID/shellcode_i_a32')

# Create the dataset and dataloader; shuffle so each epoch sees a new order.
train_dataset = ShellcodeDataset(dataset['train'], tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Define the optimizer and criterion. Adam's default lr (1e-3) is far too
# aggressive for fine-tuning pretrained weights, so use a small one instead.
optimizer = optim.Adam(model.parameters(), lr=5e-5)
# CrossEntropyLoss skips targets equal to -100 (its default ignore_index).
criterion = nn.CrossEntropyLoss()

# Training loop
# NOTE(review): assumes batch['labels'] is aligned position-for-position with
# batch['input_ids'] and uses -100 on positions that must not be scored —
# confirm against the dataset class.
model.train()
for epoch in range(3):
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Collapse any singleton dimension left by per-item tokenization so
        # every tensor is (batch, seq_len).
        input_ids = batch['input_ids']
        input_ids = input_ids.reshape(-1, input_ids.size(-1)).to(device)
        labels = batch['labels']
        labels = labels.reshape(-1, labels.size(-1)).to(device)
        attention_mask = batch.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.reshape(-1, attention_mask.size(-1)).to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        # Logits at position t score the token at position t + 1, so drop the
        # last logit and the first label, then flatten to (N, vocab) vs (N,).
        shift_logits = outputs.logits[:, :-1, :]
        shift_labels = labels[:, 1:]
        loss = criterion(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
        )

        loss.backward()
        optimizer.step()
|