minseokKoo committed on
Commit
3945f15
•
1 Parent(s): 9317198
Files changed (1) hide show
  1. app.py +199 -0
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import re
4
+ import os
5
+ import sys
6
+ import random
7
+ import transformers
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
+ from transformers import RobertaTokenizer, RobertaForSequenceClassification
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
13
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
14
+ import gradio as gr
15
+
16
+
17
+
18
+
19
def greet(co):
    """Classify a code snippet with an ensemble of four fine-tuned models.

    The snippet is stripped of C-style comments, scored by three sequence
    classifiers (two-class logits each) and by a fine-tuned T5 model (a
    generated ``0``/``1`` label), and the weighted scores are combined.

    Args:
        co: Source code to classify, as a single string. ``None`` or an empty
            string is treated as empty input.

    Returns:
        The winning class index (``0`` or ``1``) as a plain ``int``. Which
        index denotes a false alarm is defined by the fine-tuned checkpoints
        and is not visible in this file.
    """
    # The input is a single string. The previous `while True` loop never
    # changed its condition and therefore never terminated for non-empty input.
    code_text = co or ''

    # Strip comments. The block-comment pattern must be non-greedy, otherwise
    # everything between the first `/*` and the last `*/` (including real
    # code) is deleted.
    code_text = re.sub(r'/\*[\S\s]*?\*/', '', code_text)
    code_text = re.sub(r'//.*', '', code_text)
    # Collapse runs of the literal two-character sequence backslash + n into
    # one real newline.
    code_text = re.sub(r'(\\n)+', '\n', code_text)

    models_dir = os.path.join(os.getcwd(), 'models')

    # 1. CFA-CodeBERTa-small.pt -> CodeBERTa-small-v1 fine-tuned model
    path = os.path.join(models_dir, 'CFA-CodeBERTa-small.pt')
    tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
    # NOTE(review): the sequence is padded to 512 tokens but no attention mask
    # is passed to the model. Kept as-is to match the existing predictions;
    # confirm this matches how the checkpoint was trained.
    input_ids = tokenizer.encode(
        code_text, max_length=512, truncation=True, padding='max_length')
    input_ids = torch.tensor([input_ids])
    model = RobertaForSequenceClassification.from_pretrained(
        path, num_labels=2)
    model.to('cpu')
    pred_1 = _class_logits(model, input_ids)

    # 2. CFA-codebert-c.pt -> codebert-c fine-tuned model
    path = os.path.join(models_dir, 'CFA-codebert-c.pt')
    tokenizer = AutoTokenizer.from_pretrained(path)
    input_ids = tokenizer(code_text, padding=True, max_length=512,
                          truncation=True, return_token_type_ids=True)['input_ids']
    input_ids = torch.tensor([input_ids])
    model = AutoModelForSequenceClassification.from_pretrained(
        path, num_labels=2)
    pred_2 = _class_logits(model, input_ids)

    # 3. CFA-codebert-c-v2.pt -> undersampling + codebert-c fine-tuned model
    path = os.path.join(models_dir, 'CFA-codebert-c-v2.pt')
    tokenizer = RobertaTokenizer.from_pretrained(path)
    input_ids = tokenizer(code_text, padding=True, max_length=512,
                          truncation=True, return_token_type_ids=True)['input_ids']
    input_ids = torch.tensor([input_ids])
    model = RobertaForSequenceClassification.from_pretrained(
        path, num_labels=2)
    # Same 1-D shape as pred_1 / pred_2 (previously left as a (1, 2) array).
    pred_3 = _class_logits(model, input_ids)

    # 4. codeT5 fine-tuned model
    path = os.path.join(models_dir, 'CFA-codeT5')
    model_params = {
        "MODEL": path,                  # directory of the fine-tuned checkpoint
        "TRAIN_BATCH_SIZE": 8,          # training batch size
        "VALID_BATCH_SIZE": 8,          # validation batch size
        "VAL_EPOCHS": 1,                # number of validation epochs
        "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
        "MAX_TARGET_TEXT_LENGTH": 3,    # max length of target text
        "SEED": 2022,                   # set seed for reproducibility
    }
    data = pd.DataFrame({'code': [code_text]})
    pred_4 = T5Trainer(
        dataframe=data,
        source_text="code",
        model_params=model_params,
    )
    pred_4 = int(pred_4[0])

    # The T5 model yields a class label, not logits. Adding the bare label
    # would shift both class scores equally and never influence the argmax,
    # so it is turned into a one-hot vote first.
    t5_vote = np.zeros(2)
    if pred_4 in (0, 1):
        t5_vote[pred_4] = 1.0

    # ensemble
    tot_result = (pred_1 * 0.8 + pred_2 * 0.1 +
                  pred_3 * 0.1 + t5_vote * 0.1).argmax()

    # Return a built-in int rather than a NumPy integer.
    return int(tot_result)


def _class_logits(model, input_ids):
    """Run one forward pass without gradients and return the first row of logits."""
    with torch.no_grad():
        return model(input_ids)[0].detach().cpu().numpy()[0]
96
+
97
+
98
+
99
+ # codeT5
100
class YourDataSetClass(Dataset):
    """Dataset that tokenizes one text column of a DataFrame for inference.

    Each item is a dict holding the token ids and attention mask of one row,
    padded/truncated to ``source_len`` tokens.
    """

    def __init__(self, dataframe, tokenizer, source_len, source_text):
        """Store the tokenizer and select the column to encode.

        Args:
            dataframe: pandas DataFrame holding the input texts.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors when ``return_tensors="pt"``.
            source_len: Length every encoded sequence is padded/truncated to.
            source_text: Name of the DataFrame column containing the texts.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of rows in the selected column."""
        return len(self.source_text)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            Dict with ``source_ids`` and ``source_mask``, both ``torch.long``
            tensors with the leading batch dimension removed.
        """
        # Positional lookup: the DataLoader hands out 0..len-1, which only
        # matches label-based indexing when the DataFrame index is the default.
        source_text = str(self.source_text.iloc[index])
        # Collapse all whitespace runs (including newlines) to single spaces.
        source_text = " ".join(source_text.split())

        # `padding="max_length"` alone is sufficient; the deprecated
        # `pad_to_max_length` flag duplicated it.
        source = self.tokenizer(
            [source_text],
            max_length=self.source_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop only the batch axis so a length-1 sequence keeps its dimension.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
        }
133
+
134
+
135
def validate(epoch, tokenizer, model, device, loader):
    """Generate a ``'0'``/``'1'`` label for every sample served by ``loader``.

    Args:
        epoch: Unused; kept for interface compatibility with callers.
        tokenizer: Object whose ``decode`` turns generated ids into text.
        model: Model exposing ``eval()`` and ``generate()``.
        device: Device the input tensors are moved to before generation.
        loader: Iterable of batches, each a dict with ``source_ids`` and
            ``source_mask`` tensors.

    Returns:
        A list with one string per input sample, each either ``'0'`` or
        ``'1'``. Any other decoded output falls back to ``'0'``.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in loader:
            ids = batch['source_ids'].to(device, dtype=torch.long)
            mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )

            decoded = [
                tokenizer.decode(
                    g, skip_special_tokens=True,
                    clean_up_tokenization_spaces=True).strip()
                for g in generated_ids
            ]
            # Validate each prediction on its own. The previous check compared
            # the whole list against '0' / '1' with an always-true condition,
            # which replaced every batch by a single '0'.
            predictions.extend(
                label if label in ('0', '1') else '0' for label in decoded)
    return predictions
160
+
161
+
162
def T5Trainer(dataframe, source_text, model_params, step="test",):
    """Load the fine-tuned T5 checkpoint and predict labels for a DataFrame.

    Despite the name, no training happens here: the function only loads the
    tokenizer and model from ``model_params["MODEL"]`` and runs ``validate``.

    Args:
        dataframe: pandas DataFrame containing the texts to classify.
        source_text: Name of the column in ``dataframe`` holding the texts.
        model_params: Dict providing ``SEED``, ``MODEL``,
            ``MAX_SOURCE_TEXT_LENGTH``, ``VALID_BATCH_SIZE`` and
            ``VAL_EPOCHS``.
        step: Unused; kept for interface compatibility.

    Returns:
        The list returned by the last ``validate`` pass, or an empty list when
        ``VAL_EPOCHS`` is 0.
    """
    # Seed the RNGs so repeated runs are reproducible.
    seed = model_params["SEED"]
    torch.manual_seed(seed)  # pytorch random seed
    np.random.seed(seed)  # numpy random seed
    torch.backends.cudnn.deterministic = True

    checkpoint = model_params["MODEL"]
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    model = model.to('cpu')

    # Only the text column is needed for inference.
    val_set = YourDataSetClass(
        dataframe[[source_text]],
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        source_text,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=model_params["VALID_BATCH_SIZE"],
        shuffle=False,  # keep predictions aligned with the input rows
        num_workers=0,
    )

    # Initialised up front so VAL_EPOCHS == 0 returns an empty list instead of
    # raising UnboundLocalError.
    predictions = []
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions = validate(epoch, tokenizer, model, 'cpu', val_loader)

    return predictions
191
+
192
+
193
+ #################################################################################
194
+
195
# Gradio front end: one free-text input (the code snippet) and one numeric
# output (the class index returned by greet).
demo = gr.Interface(fn=greet, inputs="text", outputs="number")

# share=True asks Gradio to expose the app through a public share link.
demo.launch(share=True)