cyk1337 committed on
Commit
1e9a811
·
verified ·
1 Parent(s): d2195a8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +55 -22
README.md CHANGED
@@ -15,26 +15,9 @@ ERNIE-Code is a unified large language model (LLM) that connects 116 natural lan
15
  [ACL 2023 (Findings)](https://aclanthology.org/2023.findings-acl.676/) | [arXiv](https://arxiv.org/pdf/2212.06742)
16
 
17
 
18
- ### Multilingual Text-to-Code / Code-to-Text
19
 
20
- First preprocess the input prompt:
21
- ```python
22
- def clean_up_code_spaces(s: str):
23
- # post process
24
- # ===========================
25
- new_tokens = ["<pad>", "</s>", "<unk>", "\n", "\t", "<|space|>"*4, "<|space|>"*2, "<|space|>"]
26
- for tok in new_tokens:
27
- s = s.replace(f"{tok} ", tok)
28
-
29
- cleaned_tokens = ["<pad>", "</s>", "<unk>"]
30
- for tok in cleaned_tokens:
31
- s = s.replace(tok, "")
32
- s = s.replace("<|space|>", " ")
33
- # ===========================
34
- return s
35
- ```
36
 
37
- Then use `transformers` to load the model:
38
  ```python
39
  import torch
40
  from transformers import (
@@ -49,22 +32,72 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
49
  tokenizer = AutoTokenizer.from_pretrained(model_name)
50
  # note that you can use the aforementioned `clean_up_code_spaces` to preprocess the code
51
 
52
- input_code="快速排序"
53
- prompt="translate Chinese to English: \n%s" % (input_code) # your prompt here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  model_inputs = tokenizer(prompt, max_length=512, padding=False, truncation=True, return_tensors="pt")
 
 
56
  input_ids = model_inputs.input_ids.cuda() # by default
57
  attention_mask = model_inputs.attention_mask.cuda() # by default
58
 
59
  output = model.generate(input_ids=input_ids, attention_mask=attention_mask,
60
- num_beams=5, max_length=512) # change to your own decoding methods
61
 
62
- # Ensure to customize the post-processing of clean_up_code_spaces output according to specific requirements.
63
  output = tokenizer.decode(output.flatten(), skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  ```
65
 
 
 
66
  You can also check the official inference code on [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/ernie-code/README.en.md).
67
 
 
68
  ### Zero-shot Examples
69
  - Multilingual code-to-text generation (zero-shot)
70
 
 
15
  [ACL 2023 (Findings)](https://aclanthology.org/2023.findings-acl.676/) | [arXiv](https://arxiv.org/pdf/2212.06742)
16
 
17
 
18
+ ### Usage
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
 
21
  ```python
22
  import torch
23
  from transformers import (
 
32
  tokenizer = AutoTokenizer.from_pretrained(model_name)
33
  # note that you can use the `clean_up_code_spaces` helper (defined below) to post-process the generated code
34
 
35
+
36
+ def format_code_with_spm_compatablity(line: str):
37
+ format_dict = {
38
+ " " : "<|space|>"
39
+ }
40
+ tokens = list(line)
41
+ i = 0
42
+ while i < len(tokens):
43
+ if line[i] == "\n":
44
+ while i+1 < len(tokens) and tokens[i+1] == " ":
45
+ tokens[i+1] = format_dict.get(" ")
46
+ i += 1
47
+ i += 1
48
+ formatted_line = ''.join(tokens)
49
+ return formatted_line
50
+
51
+
52
+ TYPE="code" # define input type in ("code", "text")
53
+ input="arr.sort()"
54
+ prompt="translate python to java: \n%s" % (input) # your prompt here
55
+
56
+ TYPE="text" # define input type in ("code", "text")
57
+ input="quick sort"
58
+ prompt="translate English to Japanese: \n%s" % (input) # your prompt here
59
+
60
+ assert TYPE in ("code", "text")
61
+
62
+ # preprocess for code input
63
+ if TYPE=="code":
64
+ prompt = format_code_with_spm_compatablity(prompt)
65
 
66
  model_inputs = tokenizer(prompt, max_length=512, padding=False, truncation=True, return_tensors="pt")
67
+
68
+ model = model.cuda() # by default
69
  input_ids = model_inputs.input_ids.cuda() # by default
70
  attention_mask = model_inputs.attention_mask.cuda() # by default
71
 
72
  output = model.generate(input_ids=input_ids, attention_mask=attention_mask,
73
+ num_beams=5, max_length=20) # change to your needs
74
 
75
+ # Be sure to customize the `clean_up_code_spaces` post-processing to suit your specific requirements.
76
  output = tokenizer.decode(output.flatten(), skip_special_tokens=True)
77
+
78
+
79
+ # post-process the generated code
80
+ def clean_up_code_spaces(s: str):
81
+ # post process
82
+ # ===========================
83
+ new_tokens = ["<pad>", "</s>", "<unk>", "\n", "\t", "<|space|>"*4, "<|space|>"*2, "<|space|>"]
84
+ for tok in new_tokens:
85
+ s = s.replace(f"{tok} ", tok)
86
+
87
+ cleaned_tokens = ["<pad>", "</s>", "<unk>"]
88
+ for tok in cleaned_tokens:
89
+ s = s.replace(tok, "")
90
+ s = s.replace("<|space|>", " ")
91
+ return s
92
+ output = [clean_up_code_spaces(pred) for pred in output]
93
+
94
  ```
95
 
96
+ You can adapt the [seq2seq translation code](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) for fine-tuning.
97
+
98
  You can also check the official inference code on [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/ernie-code/README.en.md).
99
 
100
+
101
  ### Zero-shot Examples
102
  - Multilingual code-to-text generation (zero-shot)
103