maxdunhill committed on
Commit
93d8d6c
1 Parent(s): 94f7497

Upload classifier.py

My best attempt at modifying:
https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb#scrollTo=kT5-oqMPB6vp

for the purpose of detecting vulnerable vs. non-vulnerable code. Everything up to the Validation section runs.

When running the Validation section, I got the error: "TypeError: new(): invalid data type 'str'"

I'm making the file available in case a member of the community wants to get the validation functionality running.
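For reference, that message is what torch.tensor() raises when it is handed a Python str, and the only place classifier.py builds a tensor directly from the CSV contents is the 'targets' entry in MultiLabelDataset.__getitem__, which suggests the 'label' column still contains strings at that point. Below is a minimal sketch of one possible fix, assuming the labels are the strings 'vulnerable' / 'non-vulnerable' (the actual values in the CSV may differ):

# Hypothetical sketch, not part of this commit: map string labels to floats
# before building new_df, so that torch.tensor(self.targets[index], dtype=torch.float)
# receives a number instead of a str.
label_map = {'non-vulnerable': 0.0, 'vulnerable': 1.0}  # assumed label values
new_df['labels'] = data['label'].map(label_map)

# With a single binary target, the classification head would likewise need one
# output instead of the six inherited from the original multi-label notebook:
#     self.classifier = torch.nn.Linear(768, 1)
# and the target would need shape (1,) so BCEWithLogitsLoss sees (batch, 1):
#     'targets': torch.tensor([self.targets[index]], dtype=torch.float)

Anyone picking this up would need to adjust the mapping to whatever label scheme the training CSV actually uses.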

Files changed (1)
  1. classifier.py +185 -0
classifier.py ADDED
@@ -0,0 +1,185 @@
+ import warnings
+ warnings.simplefilter('ignore')
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ from sklearn import metrics
+ import transformers
+ import torch
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+ from transformers import DistilBertTokenizer, DistilBertModel
+ import logging
+ logging.basicConfig(level=logging.ERROR)
+
+ # Setting up the device for GPU usage
+
+ from torch import cuda
+ device = 'cuda' if cuda.is_available() else 'cpu'
+
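+ # hamming_score: for each sample, measures the overlap between the predicted and
+ # true label sets (|intersection| / |union|, counting an empty/empty pair as 1),
+ # then averages over all samples.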
+ def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
+     acc_list = []
+     for i in range(y_true.shape[0]):
+         set_true = set( np.where(y_true[i])[0] )
+         set_pred = set( np.where(y_pred[i])[0] )
+         tmp_a = None
+         if len(set_true) == 0 and len(set_pred) == 0:
+             tmp_a = 1
+         else:
+             tmp_a = len(set_true.intersection(set_pred))/\
+                     float( len(set_true.union(set_pred)) )
+         acc_list.append(tmp_a)
+     return np.mean(acc_list)
+
+ data = pd.read_csv('Vulnerable code dataset 15_12_22 - Training.csv')
+ #data.drop(['source_name'], inplace=True, axis=1)
+ new_df = pd.DataFrame()
+ new_df['text'] = data['text']
+ new_df['labels'] = data['label']
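+ # NOTE: if data['label'] holds strings (e.g. class names) rather than numbers,
+ # the torch.tensor(..., dtype=torch.float) call on these targets in
+ # MultiLabelDataset.__getitem__ raises "TypeError: new(): invalid data type 'str'";
+ # see the commit message above.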
+ new_df.head()
+
+
+ # Sections of config
+
+ # Defining some key variables that will be used later on in the training
+ MAX_LEN = 128
+ TRAIN_BATCH_SIZE = 4
+ VALID_BATCH_SIZE = 4
+ EPOCHS = 1
+ LEARNING_RATE = 1e-05
+ tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
+
+ class MultiLabelDataset(Dataset):
+
+     def __init__(self, dataframe, tokenizer, max_len):
+         self.tokenizer = tokenizer
+         self.data = dataframe
+         self.text = dataframe.text
+         self.targets = self.data.labels
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.text)
+
+     def __getitem__(self, index):
+         text = str(self.text[index])
+         text = " ".join(text.split())
+
+         inputs = self.tokenizer.encode_plus(
+             text,
+             None,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             pad_to_max_length=True,
+             return_token_type_ids=True
+         )
+         ids = inputs['input_ids']
+         mask = inputs['attention_mask']
+         token_type_ids = inputs["token_type_ids"]
+
+
+         return {
+             'ids': torch.tensor(ids, dtype=torch.long),
+             'mask': torch.tensor(mask, dtype=torch.long),
+             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+             'targets': torch.tensor(self.targets[index], dtype=torch.float)
+         }
+
+ train_size = 0.8
+ train_data = new_df.sample(frac=train_size, random_state=200)
+ test_data = new_df.drop(train_data.index).reset_index(drop=True)
+ train_data = train_data.reset_index(drop=True)
+
+
+ print("FULL Dataset: {}".format(new_df.shape))
+ print("TRAIN Dataset: {}".format(train_data.shape))
+ print("TEST Dataset: {}".format(test_data.shape))
+
+ training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
+ testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)
+
+ train_params = {'batch_size': TRAIN_BATCH_SIZE,
+                 'shuffle': True,
+                 'num_workers': 0
+                 }
+
+ test_params = {'batch_size': VALID_BATCH_SIZE,
+                'shuffle': True,
+                'num_workers': 0
+                }
+
+ training_loader = DataLoader(training_set, **train_params)
+ testing_loader = DataLoader(testing_set, **test_params)
+
+ # Creating the customized model by adding a dropout and a dense layer on top of DistilBERT to get the final output for the model.
+
+ class DistilBERTClass(torch.nn.Module):
+     def __init__(self):
+         super(DistilBERTClass, self).__init__()
+         self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
+         self.pre_classifier = torch.nn.Linear(768, 768)
+         self.dropout = torch.nn.Dropout(0.1)
+         self.classifier = torch.nn.Linear(768, 6)
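+         # NOTE: the six-way head is carried over from the multi-label notebook this
+         # script is based on; for a binary vulnerable/non-vulnerable task a single
+         # output (torch.nn.Linear(768, 1)) would more likely be wanted. The forward
+         # pass below also accepts token_type_ids but never uses it, since DistilBERT
+         # does not take segment ids.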
+
+     def forward(self, input_ids, attention_mask, token_type_ids):
+         output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
+         hidden_state = output_1[0]
+         pooler = hidden_state[:, 0]
+         pooler = self.pre_classifier(pooler)
+         pooler = torch.nn.Tanh()(pooler)
+         pooler = self.dropout(pooler)
+         output = self.classifier(pooler)
+         return output
+
+ model = DistilBERTClass()
+ model.to(device)
+
+ def loss_fn(outputs, targets):
+     return torch.nn.BCEWithLogitsLoss()(outputs, targets)
+
+ optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
+
+ def train(epoch):
+     model.train()
+     for _, data in tqdm(enumerate(training_loader, 0)):
+         ids = data['ids'].to(device, dtype=torch.long)
+         mask = data['mask'].to(device, dtype=torch.long)
+         token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
+         targets = data['targets'].to(device, dtype=torch.float)
+
+         outputs = model(ids, mask, token_type_ids)
+
+         optimizer.zero_grad()
+         loss = loss_fn(outputs, targets)
+         if _ % 5000 == 0:
+             print(f'Epoch: {epoch}, Loss: {loss.item()}')
+
+         loss.backward()
+         optimizer.step()
+
+ for epoch in range(EPOCHS):
+     train(epoch)
+
+ def validation(testing_loader):
+     model.eval()
+     fin_targets = []
+     fin_outputs = []
+     with torch.no_grad():
+         for _, data in tqdm(enumerate(testing_loader, 0)):
+             ids = data['ids'].to(device, dtype=torch.long)
+             mask = data['mask'].to(device, dtype=torch.long)
+             token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
+             targets = data['targets'].to(device, dtype=torch.float)
+             outputs = model(ids, mask, token_type_ids)
+             fin_targets.extend(targets.cpu().detach().numpy().tolist())
+             fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
+     return fin_outputs, fin_targets
+
+ outputs, targets = validation(testing_loader)
+
+ final_outputs = np.array(outputs) >= 0.5
+
+ val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
+ val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))
+
+ print(f"Hamming Score = {val_hamming_score}")
+ print(f"Hamming Loss = {val_hamming_loss}")