import torch
import numpy as np
from transformers import AutoModelForCausalLM, PreTrainedModel

from .MoEConfig import MoEConfig
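
# A mixture of experts assembled from whole pretrained causal LMs: each
# expert scores the incoming input with its own perplexity, and the
# lowest-perplexity expert is the one that generates.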
class MoeModel(PreTrainedModel):
    config_class = MoEConfig
    verbose = True
    fix_mode = False

    def __init__(self, config):
        super().__init__(config)
        self.model_list = []
        # The checkpoint names come from the config *instance*, not from
        # config_class (which is the MoEConfig class itself).
        for model_name in config.model_list:
            self.append_model(model_name)
        self.set_model_id(0)

"""
def set_model(self, model_name):
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
torch_dtype=torch.float16
)
"""
    def append_model(self, model_name):
        # Load one expert, sharded across available devices in fp16.
        print("loading", model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        self.model_list.append(model)

    # def set_tokenizer(self, tokenizer):
    #     self.tokenizer = tokenizer

    def set_model_id(self, model_id):
        self.model = self.model_list[model_id]

    def calc_perplexity(self, tokenized_input):
        # Score the same input with every expert; lower is a better fit.
        ppl_list = [perplexity(model, tokenized_input)
                    for model in self.model_list]
        return np.array(ppl_list)

    def fix_model(self, model_id):
        # Pin generation to one expert and skip per-call routing.
        self.set_model_id(model_id)
        self.fix_mode = True

    def set_flexible_mode(self):
        # Re-enable perplexity-based routing on every generate() call.
        self.fix_mode = False

    def generate(self, input_ids, attention_mask,
                 **generate_kwargs):
        if not self.fix_mode:
            # Route to the expert with the lowest perplexity on this input.
            ppl_array = self.calc_perplexity(input_ids)
            best_model_id = int(np.argmin(ppl_array))
            self.set_model_id(best_model_id)
            if self.verbose:
                print(f"model {best_model_id} will be used")
                print("ppl array:", ppl_array)
        ret = self.model.generate(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  **generate_kwargs)
        return ret

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def perplexity(model, tokenized_input) -> float:
    # Perplexity = exp(mean cross-entropy) with the input serving as its own
    # labels; both must live on the same device as the model.
    tokenized_input = tokenized_input.to(device)
    with torch.inference_mode():
        output = model(tokenized_input, labels=tokenized_input)
        ppl = torch.exp(output.loss)
    return ppl.item()
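

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module proper: the checkpoint
    # names and the shared tokenizer below are placeholders, and MoEConfig is
    # assumed to accept model_list as a keyword, as Hugging Face config
    # classes generally do. Run this via the package so the relative import
    # of MoEConfig resolves.
    from transformers import AutoTokenizer

    config = MoEConfig(model_list=["expert-checkpoint-a",
                                   "expert-checkpoint-b"])
    moe = MoeModel(config)

    tokenizer = AutoTokenizer.from_pretrained("expert-checkpoint-a")
    batch = tokenizer("Hello, world!", return_tensors="pt")

    # generate() routes to the lowest-perplexity expert unless fix_model()
    # pinned one beforehand.
    out = moe.generate(batch["input_ids"].to(device),
                       batch["attention_mask"].to(device),
                       max_new_tokens=32)
    print(tokenizer.decode(out[0], skip_special_tokens=True))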