Gong Baitao committed
Commit 782358e
Parent(s): 8a24b4e
Update modeling_cpmbee.py and README.md

Files changed:
- README.md +36 -1
- modeling_cpmbee.py +2 -2
README.md
CHANGED
@@ -67,4 +67,39 @@ res = model.generate(
 )
 print(res)
 
-```
+```
+
+We suggest using `bmtrain` to finetune CPM-Bee. You can also use `accelerate` or `deepspeed`. Here is a brief example of a training loop:
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from accelerate import Accelerator
+from torch.utils.data import Dataset, DataLoader
+
+accelerator = Accelerator()
+
+trainset = Dataset()  # Make sure trainset.__getitem__() returns data in the correct format, e.g. {"input": "...", "<ans>": ""}
+# for details, see https://github.com/OpenBMB/CPM-Bee/tree/main/tutorials/basic_task_finetune
+train_loader = DataLoader(trainset, batch_size=1)
+
+tokenizer = AutoTokenizer.from_pretrained("openbmb/cpm-bee-5b", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained("openbmb/cpm-bee-5b", trust_remote_code=True).cuda()
+
+optimizer = torch.optim.Adam(model.parameters())
+
+model, optimizer, train_loader = accelerator.prepare(
+    model, optimizer, train_loader
+)
+
+for step, data in enumerate(train_loader):
+    optimizer.zero_grad()
+
+    # change the data to a trainable format
+    input_encoded = tokenizer.prepare_for_finetune(data, max_length=512).to(model.device)
+
+    outputs = model(**input_encoded)
+    loss = outputs.loss
+    accelerator.backward(loss)
+    optimizer.step()
+```
+You should design your own parallel and mixed-precision training strategy on top of this example.
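Note: the training loop above assumes `trainset.__getitem__()` already returns `{"input": ..., "<ans>": ...}` dictionaries. Below is a minimal sketch of such a dataset; the class name and sample contents are illustrative assumptions and are not part of this commit (see the linked basic_task_finetune tutorial for the real data format).

```python
from torch.utils.data import Dataset


class CpmBeeFinetuneDataset(Dataset):
    """Hypothetical dataset returning CPM-Bee style samples."""

    def __init__(self, samples):
        # samples: list of dicts such as {"input": "...", "<ans>": "..."}
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# Illustrative sample only; real fields depend on your task.
trainset = CpmBeeFinetuneDataset([
    {"input": "Where is the capital of China?", "<ans>": "Beijing"},
])
```

For mixed precision, `accelerate` lets you request it when constructing the accelerator, e.g. `Accelerator(mixed_precision="bf16")`; distributed and parallel settings can be configured via `accelerate config`.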
modeling_cpmbee.py
CHANGED
@@ -569,10 +569,10 @@ class CpmBeeRotaryEmbedding(nn.Module):
         self.inv_freq = inv_freq.to(config.torch_dtype)
 
     def forward(self, x: torch.Tensor, x_pos: torch.Tensor):
-        inv_freq = self.inv_freq.to(device=x.device, dtype=
+        inv_freq = self.inv_freq.to(device=x.device, dtype=x.dtype)
 
         x_pos = x_pos * self.distance_scale
-        freqs = x_pos[..., None]
+        freqs = x_pos[..., None] * inv_freq[None, :] # (..., dim/2)
 
         emb = torch.cat((freqs, freqs), dim=-1) # (..., dim)
         emb_cos = emb.cos() # (..., dim)
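For context on this hunk: the patched `forward` multiplies the (scaled) positions by `inv_freq`, so each rotary dimension gets its own angle before the cos/sin tables are built. Below is a standalone sketch of the computation these lines perform; it is not the repository's class, and the head dimension and the 10000 base are illustrative assumptions.

```python
import torch


def rotary_cos_sin(x_pos: torch.Tensor, inv_freq: torch.Tensor, distance_scale: float = 1.0):
    # Scale positions, then broadcast against the per-dimension inverse frequencies.
    x_pos = x_pos * distance_scale
    freqs = x_pos[..., None] * inv_freq[None, :]  # (..., dim/2)
    emb = torch.cat((freqs, freqs), dim=-1)       # (..., dim)
    return emb.cos(), emb.sin()


# Illustrative usage: 8 positions, head dimension 16.
dim = 16
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
cos, sin = rotary_cos_sin(torch.arange(8).float(), inv_freq)
print(cos.shape, sin.shape)  # torch.Size([8, 16]) torch.Size([8, 16])
```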