import os
from typing import Dict, List, Optional

import torch
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoTokenizer

# Silence the huggingface/tokenizers fork-safety warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE

def torch_gc():
    """Release cached GPU memory held by PyTorch on the configured CUDA device."""
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    """Spread the 28 ChatGLM-6B transformer layers across `num_gpus` GPUs.

    The word embeddings, the final layernorm and the LM head are pinned to
    GPU 0 and counted as two extra units, so roughly 30 units are distributed
    in total. With num_gpus=2, for example, GPU 0 receives the embeddings/head
    plus layers 0-12 and GPU 1 receives layers 13-27.
    """
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # Embeddings, final layernorm and LM head stay on GPU 0.
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    used = 2  # GPU 0 already holds the two extra units above.
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map

class ChatLLM(LLM):
    # Generation settings and conversation state for the wrapped model.
    max_token: int = 10000
    temperature: float = 0.1
    top_p: float = 0.9
    history: List = []
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "ChatLLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:

        # Remote path: when `model` is the string 'Minimax', call the Minimax
        # chat-completion HTTP API instead of a local model.
        if self.model == 'Minimax':
            import requests

            group_id = os.getenv('group_id')
            api_key = os.getenv('api_key')

            url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            request_body = {
                "model": "abab5-chat",
                "tokens_to_generate": 512,
                "messages": []
            }

            # Replay the conversation history as alternating USER/BOT turns.
            for h_input, h_reply in self.history:
                request_body['messages'].append({
                    "sender_type": "USER",
                    "text": h_input
                })
                request_body['messages'].append({"sender_type": "BOT", "text": h_reply})

            request_body['messages'].append({"sender_type": "USER", "text": prompt})
            resp = requests.post(url, headers=headers, json=request_body)
            response = resp.json()['reply']

            # Record the latest turn so follow-up calls keep the context.
            request_body['messages'].append({"sender_type": "BOT", "text": response})
            self.history.append((prompt, response))

        else:
            # Local path: generate with the in-process ChatGLM model.
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
            )
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = self.history + [[None, response]]

        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                # Single GPU: load the model in fp16 directly onto CUDA.
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                # Multiple GPUs: shard the layers across devices with accelerate.
                from accelerate import dispatch_model

                model = AutoModel.from_pretrained(
                    model_name_or_path, trust_remote_code=True, **kwargs).half()

                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)

                self.model = dispatch_model(model, device_map=device_map)
        else:
            # No usable GPU: fall back to fp32 on the requested device (e.g. CPU).
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True)
                .float()
                .to(llm_device)
            )
        self.model = self.model.eval()
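

# A minimal usage sketch, not part of the original module: the model path and
# prompt below are placeholders, and `_call` is invoked directly here purely
# for illustration (LangChain normally calls the instance itself).
if __name__ == "__main__":
    llm = ChatLLM()
    llm.load_model(model_name_or_path="THUDM/chatglm-6b-int4")
    print(llm._call("Briefly introduce yourself."))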