Update README.md
README.md CHANGED
@@ -1,7 +1,6 @@
 ---
 tags:
 - autotrain
-- conversational
 - meta-llama
 - meta-llama/Llama-2-7b-hf
 inference: true
@@ -18,36 +17,56 @@ widget:
 
   response: ''
 library_name: peft
+pipeline_tag: text-generation
 ---
 
-```python
-!huggingface-cli login
-
-
-_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
-_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
-_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
-_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
-_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
-
-To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
-Token: <your-hf-access-token>
-```
-
 
 ```python
-
+import transformers
 from peft import PeftModel, PeftConfig
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+from torch import cuda, bfloat16
+
+base_model_id = 'meta-llama/Llama-2-7b-chat-hf'
+
+device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
+
+bnb_config = transformers.BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type='nf4',
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=bfloat16
+)
+
+
+hf_auth = "your-huggingface-access-token"
+model_config = transformers.AutoConfig.from_pretrained(
+    base_model_id,
+    use_auth_token=hf_auth
+)
 
-
+model = transformers.AutoModelForCausalLM.from_pretrained(
+    base_model_id,
+    trust_remote_code=True,
+    config=model_config,
+    quantization_config=bnb_config,
+    device_map='auto',
+    use_auth_token=hf_auth
+)
 
 config = PeftConfig.from_pretrained("Ashishkr/llama2_medical_consultation")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
 model = PeftModel.from_pretrained(model, "Ashishkr/llama2_medical_consultation").to(device)
-
+
+model.eval()
+print(f"Model loaded on {device}")
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(
+    base_model_id,
+    use_auth_token=hf_auth
+)
+
+
 
 ```
 
@@ -60,7 +79,6 @@ def llama_generate(
     prompt: str,
     max_new_tokens: int = 128,
     temperature: float = 0.92):
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     inputs = tokenizer(
         [prompt],
@@ -70,7 +88,10 @@ def llama_generate(
         device
     )
 
-
+    # Check if bfloat16 is supported, otherwise use float16
+    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+
+    with torch.autocast("cuda", dtype=dtype_to_use):
         response = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
@@ -87,7 +108,6 @@ def llama_generate(
 
     return decoded_output[len(prompt) :]
 
-
 prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n
 
@@ -97,7 +117,7 @@ goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
 but this is different. Also, my moods have been horrible for the past few weeks.\n
 
 response: """
-
+# You can use the function as before
 response = llama_generate(
     model,
     tokenizer,
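Read as a whole, the change above replaces the plain load of `meta-llama/Llama-2-7b-hf` with a 4-bit (NF4) load of `meta-llama/Llama-2-7b-chat-hf`, attaches the `Ashishkr/llama2_medical_consultation` PEFT adapter on top, and wraps generation in `torch.autocast` with a bfloat16/float16 fallback. The snippet below is a minimal consolidated sketch of that updated flow rather than the README's exact code: it assumes a CUDA GPU, recent `transformers`/`peft`/`bitsandbytes`/`accelerate` releases, granted access to the gated base checkpoint, a placeholder access token, and a placeholder patient description in the prompt.

```python
# Consolidated sketch of the updated usage flow (assumptions: CUDA GPU, access to the
# gated base checkpoint, and a real token in place of the placeholder below).
import torch
import transformers
from peft import PeftModel

base_model_id = "meta-llama/Llama-2-7b-chat-hf"
adapter_id = "Ashishkr/llama2_medical_consultation"
hf_auth = "your-huggingface-access-token"  # placeholder, not a real token

# 4-bit NF4 quantization, as configured in the diff above
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" places the quantized weights, so no explicit .to(device) is needed
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_auth,
)
model = PeftModel.from_pretrained(model, adapter_id)  # attach the fine-tuned adapter
model.eval()

tokenizer = transformers.AutoTokenizer.from_pretrained(base_model_id, token=hf_auth)

# Placeholder prompt in the README's instruction/input/response format
prompt = (
    'instruction: "If you are a doctor, please answer the medical questions '
    'based on the patient\'s description." \n'
    "<patient description goes here> \n"
    "response: "
)

inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# bfloat16 where supported, float16 otherwise, mirroring the dtype check in the diff
dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

with torch.autocast("cuda", dtype=dtype_to_use):
    output = model.generate(
        **inputs,
        max_new_tokens=128,
        temperature=0.92,
        do_sample=True,
    )

decoded = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded[len(prompt):])  # strip the echoed prompt, as llama_generate does
```

Two deliberate differences from the diff: the adapter is left where `device_map="auto"` placed the quantized base weights instead of calling `.to(device)`, which 4-bit bitsandbytes models may reject, and the token is passed as `token=` rather than the older `use_auth_token=` argument.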