arise-sustech committed
Commit 196150d • Parent(s): c55a7fa
Update readme

Files changed:
- README.md +62 -143
- config.json +2 -3
- generation_config.json +1 -1
- tokenizer.json +6 -1
- tokenizer_config.json +1 -6
README.md
CHANGED
@@ -1,168 +1,87 @@
 ---
-license: …
-…
 ---
 
 
-<p align="center">
-<img width="1000px" alt="DeepSeek Coder" src="https://github.com/deepseek-ai/DeepSeek-Coder/blob/main/pictures/logo.png?raw=true">
-</p>
-<p align="center"><a href="https://www.deepseek.com/">[🏠Homepage]</a> | <a href="https://coder.deepseek.com/">[🤖 Chat with DeepSeek Coder]</a> | <a href="https://discord.gg/Tc7c45Zzu5">[Discord]</a> | <a href="https://github.com/guoday/assert/blob/main/QR.png?raw=true">[Wechat(微信)]</a> </p>
-<hr>
 
 
-### 1. Introduction of Deepseek Coder
 
-Deepseek Coder is composed of a series of code language models, each trained from scratch on 2T tokens, with a composition of 87% code and 13% natural language in both English and Chinese. We provide various sizes of the code model, ranging from 1B to 33B versions. Each model is pre-trained on a project-level code corpus with a window size of 16K and an extra fill-in-the-blank task, to support project-level code completion and infilling. For coding capabilities, Deepseek Coder achieves state-of-the-art performance among open-source code models on multiple programming languages and various benchmarks.
-
-- **Massive Training Data**: Trained from scratch on 2T tokens, including 87% code and 13% linguistic data in both English and Chinese languages.
-
-- **Highly Flexible & Scalable**: Offered in model sizes of 1.3B, 5.7B, 6.7B, and 33B, enabling users to choose the setup most suitable for their requirements.
-
-- **Superior Model Performance**: State-of-the-art performance among publicly available code models on HumanEval, MultiPL-E, MBPP, DS-1000, and APPS benchmarks.
-
-- **Advanced Code Completion Capabilities**: A window size of 16K and a fill-in-the-blank task, supporting project-level code completion and infilling tasks.
-
-
-### 2. Model Summary
-deepseek-coder-1.3b-base is a 1.3B-parameter model with Multi-Head Attention, trained on 1 trillion tokens.
-- **Home Page:** [DeepSeek](https://deepseek.com/)
-- **Repository:** [deepseek-ai/deepseek-coder](https://github.com/deepseek-ai/deepseek-coder)
-- **Chat With DeepSeek Coder:** [DeepSeek-Coder](https://coder.deepseek.com/)
 
 
 ### 3. How to Use
-Here give some examples of how to use our model.
-#### 1) Code Completion
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
-tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True).cuda()
-input_text = "#write a quick sort algorithm"
-inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
-outputs = model.generate(**inputs, max_length=128)
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-```
-
-#### 2) Code Insertion
 ```python
-import …
-…
 ```
 
-
 ```python
 from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True).cuda()
-
-input_text = """#utils.py
 import torch
-from sklearn import datasets
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.metrics import accuracy_score
-
-def load_data():
-    iris = datasets.load_iris()
-    X = iris.data
-    y = iris.target
-
-    # Standardize the data
-    scaler = StandardScaler()
-    X = scaler.fit_transform(X)
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-    # Convert numpy data to PyTorch tensors
-    X_train = torch.tensor(X_train, dtype=torch.float32)
-    X_test = torch.tensor(X_test, dtype=torch.float32)
-    y_train = torch.tensor(y_train, dtype=torch.int64)
-    y_test = torch.tensor(y_test, dtype=torch.int64)
-
-    return X_train, X_test, y_train, y_test
-
-def evaluate_predictions(y_test, y_pred):
-    return accuracy_score(y_test, y_pred)
-#model.py
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import DataLoader, TensorDataset
-
-class IrisClassifier(nn.Module):
-    def __init__(self):
-        super(IrisClassifier, self).__init__()
-        self.fc = nn.Sequential(
-            nn.Linear(4, 16),
-            nn.ReLU(),
-            nn.Linear(16, 3)
-        )
-
-    def forward(self, x):
-        return self.fc(x)
-
-    def train_model(self, X_train, y_train, epochs, lr, batch_size):
-        criterion = nn.CrossEntropyLoss()
-        optimizer = optim.Adam(self.parameters(), lr=lr)
-
-        # Create DataLoader for batches
-        dataset = TensorDataset(X_train, y_train)
-        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
-
-        for epoch in range(epochs):
-            for batch_X, batch_y in dataloader:
-                optimizer.zero_grad()
-                outputs = self(batch_X)
-                loss = criterion(outputs, batch_y)
-                loss.backward()
-                optimizer.step()
-
-    def predict(self, X_test):
-        with torch.no_grad():
-            outputs = self(X_test)
-            _, predicted = outputs.max(1)
-            return predicted.numpy()
-#main.py
-from utils import load_data, evaluate_predictions
-from model import IrisClassifier as Classifier
-
-def main():
-    # Model training and evaluation
-"""
-inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
-outputs = model.generate(**inputs, max_new_tokens=140)
-print(tokenizer.decode(outputs[0]))
 ```
 
-
-
 ### 4. License
-This code repository is licensed under the MIT License.
-
-See the [LICENSE-MODEL](https://github.com/deepseek-ai/deepseek-coder/blob/main/LICENSE-MODEL) for more details.
 
 ### 5. Contact
 
-If you have any questions, please raise an issue …
 ---
+license: mit
+tags:
+- decompile
+- binary
 ---
 
+### 1. Introduction of LLM4Decompile
 
+LLM4Decompile aims to decompile x86 assembly instructions into C. It is fine-tuned from DeepSeek-Coder on 4B tokens of assembly-C pairs compiled from AnghaBench.
 
+- **Github Repository:** [LLM4Decompile](https://github.com/albertan017/LLM4Decompile)
 
 
+### 2. Evaluation Results
+| Model              | Re-compilability |        |        |        |        | Re-executability |        |        |        |        |
+|--------------------|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|
+| Optimization level | O0     | O1     | O2     | O3     | Avg.   | O0     | O1     | O2     | O3     | Avg.   |
+| GPT-4              | 0.92   | 0.94   | 0.88   | 0.84   | 0.895  | 0.1341 | 0.1890 | 0.1524 | 0.0854 | 0.1402 |
+| DeepSeek-Coder-33B | 0.0659 | 0.0866 | 0.1500 | 0.1463 | 0.1122 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 |
+| LLM4Decompile-1b   | 0.8780 | 0.8732 | 0.8683 | 0.8378 | 0.8643 | 0.1573 | 0.0768 | 0.1000 | 0.0878 | 0.1055 |
+| LLM4Decompile-6b   | 0.8817 | 0.8951 | 0.8671 | 0.8476 | 0.8729 | 0.3000 | 0.1732 | 0.1988 | 0.1841 | 0.2140 |
+| LLM4Decompile-33b  | 0.8134 | 0.8195 | 0.8183 | 0.8305 | 0.8204 | 0.3049 | 0.1902 | 0.1817 | 0.1817 | 0.2146 |
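
The card does not define the two metrics; in the LLM4Decompile evaluation, re-compilability is the rate at which the decompiled C compiles again, and re-executability the rate at which it still behaves like the original under test. A minimal sketch of the compile check, assuming re-compilability simply means "gcc accepts the decompiled function" (the helper name and the added headers are illustrative, not from this repo):

```python
import os
import subprocess
import tempfile

def is_recompilable(c_code: str) -> bool:
    """Check whether gcc can compile a decompiled C function to an object file."""
    with tempfile.TemporaryDirectory() as tmp:
        src = os.path.join(tmp, 'pred.c')
        obj = os.path.join(tmp, 'pred.o')
        with open(src, 'w') as f:
            # prepend common headers, since decompiled snippets rarely carry includes
            f.write('#include <stdio.h>\n#include <stdlib.h>\n' + c_code)
        result = subprocess.run(['gcc', '-c', '-o', obj, src], capture_output=True)
        return result.returncode == 0
```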
 
 
 ### 3. How to Use
+Here is an example of how to use our model.
+First, compile the C code into a binary and disassemble the binary into assembly instructions:
 ```python
+import subprocess
+import os
+import re
+
+digit_pattern = r'\b0x[a-fA-F0-9]+\b'  # hexadecimal literals (unused in this snippet)
+zeros_pattern = r'^0+\s'  # leading zeros
+OPT = ["O0", "O1", "O2", "O3"]
+after = "\n# What is the source code?\n"
+fileName = 'path/to/file'
+with open(fileName + '.c', 'r') as f:  # original C file
+    c_func = f.read()
+for opt_state in OPT:
+    output_file = fileName + '_' + opt_state
+    input_file = fileName + '.c'
+    # compile the code with GCC on Linux
+    compile_command = f'gcc -c -o {output_file}.o {input_file} -{opt_state} -lm'
+    subprocess.run(compile_command, shell=True, check=True)
+    # disassemble the object file into assembly instructions
+    compile_command = f'objdump -d {output_file}.o > {output_file}.s'
+    subprocess.run(compile_command, shell=True, check=True)
+
+    input_asm = ''
+    with open(output_file + '.s', 'r') as f:
+        asm = f.read()
+    asm = asm.split('Disassembly of section .text:')[-1].strip()
+    for tmp in asm.split('\n'):
+        tmp_asm = tmp.split('\t')[-1]  # remove the binary code
+        tmp_asm = tmp_asm.split('#')[0].strip()  # remove the comments
+        input_asm += tmp_asm + '\n'
+    input_asm = re.sub(zeros_pattern, '', input_asm)
+
+    # the prompt names the optimization level, so build it inside the loop
+    before = f"# This is the assembly code with {opt_state} optimization:\n"
+    input_asm_prompt = before + input_asm.strip() + after
+    with open(fileName + '_' + opt_state + '.asm', 'w', encoding='utf-8') as f:
+        f.write(input_asm_prompt)
 ```
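
Each generated `.asm` file is a plain-text prompt. With the `before` and `after` strings above, it looks roughly like this (the assembly body is abridged, illustrative objdump output, not from a real run):

```
# This is the assembly code with O0 optimization:
<func0>:
endbr64
push   %rbp
mov    %rsp,%rbp
...
pop    %rbp
ret
# What is the source code?
```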
 
+Then use LLM4Decompile to translate the assembly instructions into C:
 ```python
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
+model_path = 'arise-sustech/llm4decompile-1.3b'
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
 
+with open(fileName + '_' + opt_state + '.asm', 'r') as f:  # prompt written by the step above
+    asm_func = f.read()
+inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model.generate(**inputs, max_new_tokens=512)
+# decode only the newly generated tokens, dropping the prompt and the final EOS
+c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
 ```
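
The decoded string is the recovered C function. A short follow-up sketch for inspecting and saving it (the `.pred.c` name is arbitrary, and `is_recompilable` is the illustrative helper sketched after the evaluation table, not part of this repo):

```python
print(c_func_decompile)  # the recovered C source

with open(fileName + '_' + opt_state + '.pred.c', 'w') as f:
    f.write(c_func_decompile)

# optional sanity check against the re-compilability criterion
print('re-compilable:', is_recompilable(c_func_decompile))
```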
 
 ### 4. License
+This code repository is licensed under the MIT License.
 
 ### 5. Contact
 
+If you have any questions, please raise an issue.
config.json
CHANGED
@@ -1,10 +1,9 @@
 {
-  "_name_or_path": "…",
+  "_name_or_path": "/share/luoqi/models/deepseek-coder-1.3b-base",
   "architectures": [
     "LlamaForCausalLM"
   ],
   "attention_bias": false,
-  "attention_dropout": 0.0,
   "bos_token_id": 32013,
   "eos_token_id": 32014,
   "hidden_act": "silu",
@@ -25,7 +24,7 @@
   "rope_theta": 100000,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.…",
+  "transformers_version": "4.34.1",
   "use_cache": false,
   "vocab_size": 32256
 }
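
For a quick sanity check of these fields, the published config can be loaded through the standard transformers API (a minimal sketch; the expected values follow from the file above):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained('arise-sustech/llm4decompile-1.3b')
print(cfg.architectures)  # ['LlamaForCausalLM']
print(cfg.vocab_size)     # 32256
print(cfg.torch_dtype)    # torch.bfloat16
```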
generation_config.json
CHANGED
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 32013,
   "eos_token_id": 32014,
-  "transformers_version": "4.…"
+  "transformers_version": "4.34.1"
 }
tokenizer.json
CHANGED
@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 16384,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {
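
This turns on default truncation at 16384 tokens in the raw tokenizer file. A minimal sketch to confirm the behavior, assuming a local copy of `tokenizer.json` from this repo:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# a deliberately over-long input should now be capped at 16384 tokens
enc = tok.encode("int x;\n" * 100000)
assert len(enc.ids) <= 16384
```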
tokenizer_config.json
CHANGED
@@ -1,6 +1,4 @@
 {
-  "add_bos_token": true,
-  "add_eos_token": false,
   "added_tokens_decoder": {
     "32000": {
       "content": "õ",
@@ -180,16 +178,13 @@
     }
   },
   "bos_token": "<|begin▁of▁sentence|>",
-  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content }}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|end▁of▁sentence|>",
   "legacy": true,
   "model_max_length": 16384,
   "pad_token": "<|end▁of▁sentence|>",
-  "padding_side": "right",
   "sp_model_kwargs": {},
-  "split_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": null,
-  "use_default_system_prompt": false
+  "use_default_system_prompt": true
 }