Update README.md

README.md CHANGED

@@ -19,7 +19,9 @@ widget:
library_name: peft
pipeline_tag: text-generation
---

llama-2-7b-hf model finetuned for medical consultation. It works on a T4 GPU (16 GB VRAM) as well as on CPU (32 GB RAM).
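
Before picking one of the two setups, it can help to confirm what hardware the machine actually exposes. The snippet below is a minimal sketch for that check; it only assumes `torch` is installed, which both examples below require anyway.

```python
import torch

# Report whether a CUDA GPU is visible and roughly how much VRAM it has,
# to help choose between the GPU and CPU instructions that follow.
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name} with {props.total_memory / 1024**3:.1f} GB VRAM")
else:
    print("No CUDA GPU detected -- follow the CPU instructions below.")
```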

**To run on GPU:**

```python
import transformers
@@ -129,4 +131,108 @@ response = llama_generate(
print(response)

```

**To run on CPU:**

```python
import torch
import transformers
from torch import cuda
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Use the GPU if one is visible, otherwise fall back to the CPU
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# bitsandbytes option: keep parts of the model on the CPU in float32
# when 8-bit GPU quantization is used
bnb_config = transformers.BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True
)

hf_auth = "YOUR-HUGGINGFACE-ACCESS-TOKEN"  # Hugging Face access token for the gated Llama-2 weights
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    # device_map='auto',
    use_auth_token=hf_auth
)

# Load the medical-consultation LoRA adapter on top of the base model
config = PeftConfig.from_pretrained("Ashishkr/llama-2-medical-consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama-2-medical-consultation").to(device)

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)


def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92):

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    # Run inference in full precision (float32); no gradients are needed
    with torch.no_grad():
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,  # sampling must be enabled for temperature to take effect
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    # Return only the newly generated text, without the echoed prompt
    return decoded_output[len(prompt):]


prompt = """
instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks.\n

response: """

# Call the helper exactly as in the GPU example
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)
```
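
The example prompt above uses an instruction / input / response layout; new queries can reuse the same layout with the `llama_generate` helper defined above. The sketch below assumes the code above has already been run (so `model`, `tokenizer`, and `llama_generate` exist); the patient description is purely illustrative.

```python
# Hypothetical follow-up query, reusing the same prompt template.
new_prompt = """
instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "I have had a persistent dry cough and a mild fever for the past five days." \n

response: """

answer = llama_generate(
    model,
    tokenizer,
    new_prompt,
    max_new_tokens=128,
    temperature=0.7,  # a lower temperature gives more conservative wording
)
print(answer)
```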