Update README.md
README.md

ERNIE-Code is a unified large language model (LLM) that connects 116 natural languages with 6 programming languages.

[ACL 2023 (Findings)](https://aclanthology.org/2023.findings-acl.676/) | [arXiv](https://arxiv.org/pdf/2212.06742)

### Usage

The snippet below loads the model and tokenizer with `transformers`, rewrites whitespace in code prompts into `<|space|>` placeholder tokens so that indentation survives the sentencepiece tokenizer, runs generation, and finally maps the placeholders back to plain spaces with `clean_up_code_spaces`. (The checkpoint id in the snippet is an assumption; substitute the checkpoint you actually use.)

```python
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

model_name = "baidu/ernie-code-560m"  # assumed checkpoint id; replace with the checkpoint you use

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# note: `clean_up_code_spaces` (defined below) is used to post-process the generated code


def format_code_with_spm_compatablity(line: str):
    # replace the spaces that follow a newline (i.e. code indentation) with
    # "<|space|>" placeholders so they are not lost by the tokenizer
    format_dict = {
        " ": "<|space|>"
    }
    tokens = list(line)
    i = 0
    while i < len(tokens):
        if line[i] == "\n":
            while i + 1 < len(tokens) and tokens[i + 1] == " ":
                tokens[i + 1] = format_dict.get(" ")
                i += 1
        i += 1
    formatted_line = ''.join(tokens)
    return formatted_line


# pick ONE of the two examples below

# example 1: code-to-code translation
TYPE = "code"  # define input type in ("code", "text")
input = "arr.sort()"
prompt = "translate python to java: \n%s" % (input)  # your prompt here

# example 2: text-to-text translation (overrides example 1)
TYPE = "text"  # define input type in ("code", "text")
input = "quick sort"
prompt = "translate English to Japanese: \n%s" % (input)  # your prompt here

assert TYPE in ("code", "text")

# preprocess for code input
if TYPE == "code":
    prompt = format_code_with_spm_compatablity(prompt)

model_inputs = tokenizer(prompt, max_length=512, padding=False, truncation=True, return_tensors="pt")

model = model.cuda()  # move the model and inputs to GPU (assumes a CUDA device is available)
input_ids = model_inputs.input_ids.cuda()
attention_mask = model_inputs.attention_mask.cuda()

output = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                        num_beams=5, max_length=20)  # change to your needs

output = tokenizer.decode(output.flatten(), skip_special_tokens=True)


# post-process the generation: strip any remaining special tokens and map
# "<|space|>" placeholders back to spaces; customize to your own requirements
def clean_up_code_spaces(s: str):
    # ===========================
    new_tokens = ["<pad>", "</s>", "<unk>", "\n", "\t", "<|space|>"*4, "<|space|>"*2, "<|space|>"]
    for tok in new_tokens:
        s = s.replace(f"{tok} ", tok)

    cleaned_tokens = ["<pad>", "</s>", "<unk>"]
    for tok in cleaned_tokens:
        s = s.replace(tok, "")
    s = s.replace("<|space|>", " ")
    return s

output = clean_up_code_spaces(output)
```
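
To make the whitespace handling concrete, here is a small round trip through the two helpers defined above. This is only an illustrative sketch; the `snippet` string is an assumption, not an example from the original card.

```python
# Illustrative round trip of the "<|space|>" placeholder handling.
snippet = "def add(a, b):\n    return a + b"

encoded = format_code_with_spm_compatablity(snippet)
# the indentation after the newline is now made of "<|space|>" placeholders:
# "def add(a, b):\n<|space|><|space|><|space|><|space|>return a + b"

decoded = clean_up_code_spaces(encoded)
assert decoded == snippet  # placeholders map back to plain spaces
```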

You can adapt the [seq2seq translation code](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) for finetuning.
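
Along those lines, a minimal in-Python sketch using `Seq2SeqTrainer` might look as follows. It reuses the `model` and `tokenizer` loaded above; the toy `pairs` data, the hyperparameters, and the `./ernie-code-ft` output path are placeholder assumptions, not values from this README.

```python
# Minimal fine-tuning sketch with Seq2SeqTrainer (toy data, placeholder settings).
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# toy parallel data: source prompt -> target string (replace with your own corpus)
pairs = [
    {"source": "translate python to java: \narr.sort()", "target": "Collections.sort(arr);"},
]
ds = Dataset.from_list(pairs)

def preprocess(example):
    # apply the same space formatting used at inference time for code inputs
    source = format_code_with_spm_compatablity(example["source"])
    model_inputs = tokenizer(source, max_length=512, truncation=True)
    labels = tokenizer(text_target=example["target"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

ds = ds.map(preprocess, remove_columns=ds.column_names)

args = Seq2SeqTrainingArguments(
    output_dir="./ernie-code-ft",      # placeholder path
    per_device_train_batch_size=8,
    num_train_epochs=1,
    learning_rate=1e-4,
    predict_with_generate=True,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=ds,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    tokenizer=tokenizer,
)
trainer.train()
```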

You can also check the official inference code on [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/ernie-code/README.en.md).

### Zero-shot Examples
- Multilingual code-to-text generation (zero-shot)