Upload handler.py
Browse files - handler.py +4 -4
handler.py
CHANGED
@@ -116,26 +116,26 @@ class MistralAttention(MistralAttention):
|
|
116 |
|
117 |
|
118 |
class EndpointHandler():
|
119 |
-
def __init__(self):
|
120 |
self.instruction = 'Given a web search query, retrieve relevant passages that answer the query:\n'
|
121 |
self.max_length = 4096
|
122 |
self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
123 |
|
124 |
|
125 |
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
126 |
self.tokenizer.pad_token = '[PAD]'
|
127 |
self.tokenizer.padding_side = 'left'
|
128 |
|
129 |
bnb_config = BitsAndBytesConfig(load_in_8bit=True, bnb_8bit_compute_dtype=torch.float16)
|
130 |
|
131 |
self.model = AutoModel.from_pretrained(
|
132 |
-
|
133 |
quantization_config=bnb_config,
|
134 |
device_map="auto",
|
135 |
trust_remote_code=True,
|
136 |
attn_implementation="eager",
|
137 |
)
|
138 |
-
self.model = PeftModel.from_pretrained(model, '/lora')
|
139 |
self.model.eval()
|
140 |
|
141 |
|
|
|
116 |
|
117 |
|
118 |
class EndpointHandler():
|
119 |
+
def __init__(self, model_dir=''):
|
120 |
self.instruction = 'Given a web search query, retrieve relevant passages that answer the query:\n'
|
121 |
self.max_length = 4096
|
122 |
self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
123 |
|
124 |
|
125 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
|
126 |
self.tokenizer.pad_token = '[PAD]'
|
127 |
self.tokenizer.padding_side = 'left'
|
128 |
|
129 |
bnb_config = BitsAndBytesConfig(load_in_8bit=True, bnb_8bit_compute_dtype=torch.float16)
|
130 |
|
131 |
self.model = AutoModel.from_pretrained(
|
132 |
+
model_dir,
|
133 |
quantization_config=bnb_config,
|
134 |
device_map="auto",
|
135 |
trust_remote_code=True,
|
136 |
attn_implementation="eager",
|
137 |
)
|
138 |
+
self.model = PeftModel.from_pretrained(self.model, '/lora')
|
139 |
self.model.eval()
|
140 |
|
141 |
|