import numpy as np
import torch
from transformers import AutoModelForCausalLM, PreTrainedModel

from .MoEConfig import MoEConfig


class MoeModel(PreTrainedModel):
    """A mixture-of-experts wrapper that routes each prompt to the
    expert with the lowest perplexity on the input."""

    config_class = MoEConfig
    verbose = True
    fix_mode = False

    def __init__(self, config):
        super().__init__(config)
        self.model_list = []
        # Load every expert named in the config instance.
        for model_name in config.model_list:
            self.append_model(model_name)

        # Default to the first expert until routing picks one.
        self.set_model_id(0)

    """
    def set_model(self, model_name):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16
        )
    """

    def append_model(self, model_name):
        # Load an expert in fp16; device_map="auto" places it across
        # the available devices.
        print(f"loading {model_name}")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16
        )
        self.model_list.append(model)

    # def set_tokenizer(self, tokenizer):
    #    self.tokenizer = tokenizer

    def set_model_id(self, model_id):
        # Make the chosen expert the active model for generation.
        self.model = self.model_list[model_id]

    def calc_perplexity(self, tokenized_input):
        # Score the prompt with every expert; lower perplexity means
        # the expert fits the prompt better.
        ppl_list = []
        for model in self.model_list:
            ppl_list.append(perplexity(model, tokenized_input))
        return np.array(ppl_list)

    def fix_model(self, model_id):
        # Pin a specific expert and disable per-prompt routing.
        self.set_model_id(model_id)
        self.fix_mode = True

    def set_flexible_mode(self):
        # Re-enable per-prompt routing.
        self.fix_mode = False

    def generate(self, input_ids, attention_mask,
                 **generate_kwargs):
        # In flexible mode, route the prompt to the expert with the
        # lowest perplexity; in fix mode, keep the pinned expert.
        if not self.fix_mode:
            ppl_array = self.calc_perplexity(input_ids)
            best_model_id = int(np.argmin(ppl_array))
            self.set_model_id(best_model_id)

            if self.verbose:
                print(f"model {best_model_id} will be used")
                print("ppl array: ", ppl_array)

        return self.model.generate(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   **generate_kwargs)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def perplexity(model, tokenized_input) -> float:
    # Perplexity of the input under the model: exp of the
    # language-modeling loss when the input ids serve as labels.
    tokenized_input = tokenized_input.to(device)
    with torch.inference_mode():
        output = model(tokenized_input, labels=tokenized_input)
    ppl = torch.exp(output.loss)
    return ppl.item()
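

# A minimal usage sketch, not part of the module proper: the checkpoint
# names below ("gpt2", "distilgpt2") and the MoEConfig(model_list=...)
# call are assumptions; substitute the experts and config fields your
# setup actually uses. Because of the relative import above, run this
# as a module inside its package (python -m <package>.<this_module>).
if __name__ == "__main__":
    from transformers import AutoTokenizer

    # Hypothetical experts; both share the GPT-2 tokenizer, so one
    # tokenizer can feed every expert.
    config = MoEConfig(model_list=["gpt2", "distilgpt2"])
    moe = MoeModel(config)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    inputs = tokenizer("The capital of France is", return_tensors="pt")

    # generate() scores the prompt with every expert and routes it to
    # the lowest-perplexity one before decoding.
    output_ids = moe.generate(input_ids=inputs["input_ids"].to(device),
                              attention_mask=inputs["attention_mask"].to(device),
                              max_new_tokens=20)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))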