# File size: 5,534 Bytes
# 8afc956
import os
from typing import Dict, List, Optional, Tuple, Union
import torch
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoTokenizer
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably meant to silence the tokenizers parallelism/fork
# warning -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Device type and index used to build the CUDA device string below.
DEVICE = "cuda"
DEVICE_ID = "0"

# "<DEVICE>:<DEVICE_ID>" when an index is configured, otherwise just DEVICE.
if DEVICE_ID:
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}"
else:
    CUDA_DEVICE = DEVICE
def torch_gc():
    """Release cached CUDA memory on ``CUDA_DEVICE``.

    Does nothing when CUDA is unavailable. Otherwise, with ``CUDA_DEVICE``
    selected, calls ``torch.cuda.empty_cache()`` followed by
    ``torch.cuda.ipc_collect()``.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    """Build a module-name -> GPU-index map spreading the model over GPUs.

    The embedding, final layer norm and LM head are always pinned to GPU 0;
    the 28 ``transformer.layers.{i}`` entries are assigned to GPUs in
    increasing order, roughly ``30 / num_gpus`` slots per GPU.

    Args:
        num_gpus: Number of GPUs to distribute the model across (>= 1).

    Returns:
        A dict with 31 entries mapping module names to GPU indices.

    Raises:
        ValueError: If ``num_gpus`` is less than 1.
    """
    if num_gpus < 1:
        raise ValueError(
            f"num_gpus must be a positive integer, got {num_gpus!r}")

    # transformer.word_embeddings counts as 1 layer.
    # transformer.final_layernorm together with lm_head counts as 1 layer.
    # transformer.layers accounts for 28 layers.
    # In total 30 layers are distributed across num_gpus cards.
    num_trans_layers = 28
    per_gpu_layers = (num_trans_layers + 2) / num_gpus

    # bugfix: on Linux the weight and input passed to torch.embedding could end
    # up on different devices, causing a RuntimeError.
    # On Windows, model.device is set to transformer.word_embeddings.device.
    # On Linux, model.device is set to lm_head.device.
    # When chat or stream_chat is called, input_ids is placed on model.device.
    # If transformer.word_embeddings.device differs from model.device, a
    # RuntimeError results.
    # Therefore transformer.word_embeddings, transformer.final_layernorm and
    # lm_head are all placed on the first card.
    device_map = {
        'transformer.word_embeddings': 0,
        'transformer.final_layernorm': 0,
        'lm_head': 0,
    }

    # GPU 0 starts with 2 slots already consumed by the pinned modules above.
    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            # Current GPU is full: move on to the next one.
            gpu_target += 1
            used = 0
        # Internal invariant: the slot budget never runs past the last GPU.
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1
    return device_map
class ChatLLM(LLM):
    """LangChain ``LLM`` backed by a local chat model or the MiniMax HTTP API.

    When ``model`` equals the string ``'Minimax'``, ``_call`` posts the
    conversation to the MiniMax chat-completion endpoint. Otherwise ``_call``
    invokes ``self.model.chat(self.tokenizer, ...)`` on the model loaded by
    ``load_model``.
    """

    # Passed to model.chat() as ``max_length``.
    max_token: int = 10000
    # Passed to model.chat() as ``temperature``.
    temperature: float = 0.1
    # NOTE(review): declared but never forwarded to model.chat() in _call.
    top_p: float = 0.9
    # Conversation turns. The MiniMax path stores (prompt, response) tuples;
    # the local-model path stores [None, response] lists.
    history: list = []
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "ChatLLM"

    def _minimax_reply(self, prompt: str) -> str:
        """Post ``self.history`` plus ``prompt`` to MiniMax; return its 'reply'.

        Reads the ``group_id`` and ``api_key`` environment variables. Raises
        whatever ``requests`` / JSON decoding raise, and ``KeyError`` when the
        response JSON has no ``'reply'`` field.
        """
        import requests

        group_id = os.getenv('group_id')
        api_key = os.getenv('api_key')
        url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        # Replay earlier turns as alternating USER / BOT messages, then add
        # the new prompt as the final USER message.
        messages = []
        for turn in self.history:
            messages.append({"sender_type": "USER", "text": turn[0]})
            messages.append({"sender_type": "BOT", "text": turn[1]})
        messages.append({"sender_type": "USER", "text": prompt})

        request_body = {
            "model": "abab5-chat",
            "tokens_to_generate": 512,
            'messages': messages
        }
        # NOTE(review): no timeout is set, so a stalled connection blocks
        # indefinitely; consider passing timeout=... once a budget is agreed.
        resp = requests.post(url, headers=headers, json=request_body)
        return resp.json()['reply']

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:
        """Generate a reply to ``prompt`` and record it in ``self.history``.

        Args:
            prompt: The user input for this turn.
            stop: Optional stop sequences; when given, the reply is cut with
                ``enforce_stop_tokens`` before it is stored and returned.

        Returns:
            The (possibly truncated) reply text.
        """
        use_minimax = self.model == 'Minimax'

        if use_minimax:
            response = self._minimax_reply(prompt)
        else:
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
            )
            torch_gc()

        # Honour stop sequences on both back-ends before recording history.
        if stop is not None:
            response = enforce_stop_tokens(response, stop)

        # Record the turn exactly once, keeping each back-end's entry shape.
        if use_minimax:
            self.history.append((prompt, response))
        else:
            self.history = self.history + [[None, response]]
        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        """Load the tokenizer and model into ``self.tokenizer`` / ``self.model``.

        Args:
            model_name_or_path: Identifier or path given to ``from_pretrained``.
            llm_device: Target device string. A value starting with "cuda"
                selects the GPU paths when CUDA is available; otherwise the
                model is loaded in float32 and moved to ``llm_device``.
            device_map: Optional module-name -> GPU-index map handed to
                ``accelerate.dispatch_model``. When ``None`` and two or more
                GPUs are visible, ``auto_configure_device_map`` builds one.
            **kwargs: Extra keyword arguments forwarded to
                ``AutoModel.from_pretrained`` on every path.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )

        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            # Decide between single- and multi-GPU deployment based on the
            # number of GPUs visible on this machine.
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                # NOTE(review): .cuda() uses the current CUDA device, not the
                # index in llm_device (e.g. "cuda:1") -- confirm intended.
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                from accelerate import dispatch_model

                model = AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True,
                    **kwargs).half()
                # A custom device_map may be passed in to control the
                # placement on each card.
                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)
                self.model = dispatch_model(model, device_map=device_map)
        else:
            # CPU (or other non-CUDA) path: forward **kwargs here as well so
            # loader options are not silently dropped.
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True,
                    **kwargs)
                .float()
                .to(llm_device)
            )

        self.model = self.model.eval()